## Purpose: Scrape course data from Udemy, together with information about each course's instructor.

In [1]:
# Import required modules
from requests_html import HTMLSession
from selenium import webdriver
import time
import pandas as pd
import numpy as np
from concurrent.futures import ProcessPoolExecutor

In [2]:
# This function scrapes broad category links from the main URL of Udemy
def getBraodCatLink(url):
    """Collect the broad category links shown on the Udemy landing page.

    url = https://www.udemy.com,
    return = de-duplicated list of broad category (like development,
    photography, music etc) links, in arbitrary order."""

    driver = webdriver.Chrome(r"/home/faysal/Documents/utilities/chromedriver")
    try:
        driver.get(url)
        # Fixed pause before querying the page (presumably to let it finish rendering)
        time.sleep(4)
        links = driver.find_elements_by_class_name("category-link-tag--category-card-wrapper--I27oz.category-card--category-card--3x2z6 a")
        # A set removes the duplicate links
        broadCatLink = {lnk.get_attribute("href") for lnk in links}
    finally:
        # quit() ends the whole driver session even when scraping raised
        driver.quit()

    # Anchors without an href yield None, which is not a usable link
    broadCatLink.discard(None)
    return list(broadCatLink)

In [3]:
# This function scrapes sub category links from every broad category link.
def getSubCatLink(urls, page, sortBy):
    """Build paginated, sorted sub category links for every broad category.

    urls = urls of broad category links(must be a iterable),
    page = no of pages to scrape for each broad category link,
    sortBy = sort the page by highest-rated, popularity or newest,
    return = list of subcategory links, one per sub category and page number"""

    subCatLink = []
    driver = webdriver.Chrome(r"/home/faysal/Documents/utilities/chromedriver")
    try:
        for catUrl in urls:
            driver.get(catUrl)
            # Fixed pause before querying the page (presumably to let it finish rendering)
            time.sleep(4)
            buttons = driver.find_elements_by_class_name("udlite-btn.udlite-btn-small.udlite-btn-ghost.udlite-heading-sm.link-bar--nav-button--3uJ__")
            # The first matched element is skipped
            # (presumably it is not a sub category link - TODO confirm)
            for btn in buttons[1:]:
                # Read the href once instead of once per page
                href = btn.get_attribute("href")
                if not href:
                    # No href means no link to build; avoids "None?p=..." URLs
                    continue
                subCatLink.extend(f'{href}?p={p}&sort={sortBy}' for p in range(1, page + 1))
    finally:
        # quit() ends the whole driver session even when scraping raised
        driver.quit()
    return subCatLink

In [4]:
# This function scrapes individual course links from every sub category link
def getCourseLink(urls):
    """Collect individual course links from every sub category page.

    urls = subcategory links(must be iterable),
    return = list of individual course links (only hrefs containing "/course/")"""

    courseLink = []
    driver = webdriver.Chrome(r"/home/faysal/Documents/utilities/chromedriver")
    try:
        for pageUrl in urls:
            driver.get(pageUrl)
            # Fixed pause before querying the page (presumably to let it finish rendering)
            time.sleep(4)
            anchors = driver.find_elements_by_css_selector("div.popover--popover--t3rNO.popover--popover-hover--14ngr a")
            courseLink.extend(a.get_attribute("href") for a in anchors)
    finally:
        # quit() ends the whole driver session even when scraping raised
        driver.quit()

    # Filter links that are not course links; anchors without an href give
    # None, which must be dropped before the substring test
    return [href for href in courseLink if href and "/course/" in href]

In [5]:
# This function scrapes individual course info from individual course link
def scrapeCourseInfo(url):
    """Scrape the details of one course page into a one-row dataframe.

    url = individual course link.
    return = course info as a dataframe with the columns courseTitle, subtitle,
    courseLink, unitSold, originalPrice, lastPublishedOrUpdated, typeStarRating,
    broadCategory, subCategory, offerPrice, type and instructorLink.
    Any element that cannot be read is stored as "na" (numeric columns become 0)."""

    breadcrumbSelector = "div.topic-menu.udlite-breadcrumb a.udlite-heading-sm"

    # The session is closed as soon as everything has been read from the page
    with HTMLSession() as s:
        r = s.get(url)

        def textOrNa(selector, index=None):
            """Text of the first match for selector (or of match number index); "na" on failure."""
            try:
                if index is None:
                    return r.html.find(selector, first=True).text
                return r.html.find(selector)[index].text
            except Exception:
                # Best effort: a missing element must not abort the whole scrape,
                # but KeyboardInterrupt/SystemExit are no longer swallowed
                return "na"

        # Create a df off scraped data
        df = pd.DataFrame({
            "courseTitle": [textOrNa("div.clp-component-render h1")],
            "subtitle": [textOrNa("div.udlite-text-md.clp-lead__headline")],
            "courseLink": [url],
            # Unit sold is the last component of the badge/ratings/enrollment row
            "unitSold": [textOrNa("div.clp-lead__badge-ratings-enrollment div.clp-component-render", -1)],
            "originalPrice": [textOrNa("div.price-text--price-part--Tu6MH.price-text--original-price--2e-F5.udlite-clp-list-price.udlite-text-sm")],
            # Offer price, original price and discount
            "discountOfferAndOriginalPrice": [textOrNa("div.price-text--container--Ws-fP.udlite-clp-price-text")],
            # Published date or last updated date
            "lastPublishedOrUpdated": [textOrNa("div.last-update-date")],
            # Type, star and rating
            "typeStarRating": [textOrNa("div.clp-lead__badge-ratings-enrollment")],
            # The first breadcrumb entry is the broad category, the second the sub category
            "broadCategory": [textOrNa(breadcrumbSelector, 0)],
            "subCategory": [textOrNa(breadcrumbSelector, 1)],
        })

    # Extract offer price: first line, text after the last "$", thousands separators removed
    df["offerPrice"] = pd.to_numeric(df.discountOfferAndOriginalPrice.str.split("\n").str[0].str.split("$").str[-1].str.replace(",", ""), errors="coerce").fillna(0)

    # Clean unit sale: leading token of the text, thousands separators removed
    df.unitSold = pd.to_numeric(df.unitSold.str.split(" ").str[0].str.replace(",", ""), errors="coerce").fillna(0).astype(int)

    # Clean original price: last line, text after the last "$", thousands separators removed
    df.originalPrice = pd.to_numeric(df.originalPrice.str.split("\n").str[-1].str.split("$").str[-1].str.replace(",", ""), errors="coerce").fillna(0)

    # Drop unnecessary columns
    df.drop("discountOfferAndOriginalPrice", axis=1, inplace=True)

    # Extract if a course is new, new & hot, or bestseller from the first line
    # of the badge text; "Hot & New" is tested first because it also contains "New"
    badge = df.typeStarRating.str.split("\n").str[0]
    df["type"] = np.where(badge.str.contains("Hot & New"), "hotAndNew",
                          np.where(badge.str.contains("New"), "new",
                                   np.where(badge.str.contains("Bestseller"), "bestseller", "na")))

    # Get instructor links
    df["instructorLink"] = df.courseLink + "#instructor-1"

    return df

In [6]:
# Wrap all the function inside main
def main(url, page, sortBy):
    """Run the three link-scraping steps one after another.

    url = https://www.udemy.com/,
    page = total pages to scrape,
    sortBy = the requested page to sort by,
    return = individual course links."""

    # Broad category links -> sub category links -> individual course links
    categories = getBraodCatLink(url)
    subCategories = getSubCatLink(categories, page, sortBy)
    return getCourseLink(subCategories)

# Scrape the individual course links: 1 page per sub category, sorted by newest.
# The result is kept in the module-level name courseLink, which getCourseData reads.
courseLink = main("https://www.udemy.com/", 1, "newest")

In [7]:
# This function scrapes course data by chunks of course link
def getCourseData(s1, s2):
    """Scrape the course links in courseLink[s1:s2] in parallel.

    s1 = start index of course link,
    s2 = end index of course link,
    return = one dataframe holding the course info of every link in the chunk"""

    # courseLink is the module-level list of individual course links
    chunk = courseLink[s1:s2]
    with ProcessPoolExecutor(max_workers=6) as pool:
        frames = list(pool.map(scrapeCourseInfo, chunk))
        combined = pd.concat(frames)
    return combined.reset_index(drop=True)

# Scrape the course data for every collected course link in a single chunk
# (start index 0, no end index)
c1 = getCourseData(0, None)

In [8]:
# This function scrapes instructors details
def scrapeInstructorInfo(url):
    """Scrape the instructor details of one page into a one-row dataframe.

    url = instructor links,
    return = instructor info as a dataframe with the columns instructorName,
    designation, instructorLink, instructorRating, instructorReview,
    instructorStudent and instructorTotalCourse.
    Any element that cannot be read is stored as "na" (numeric columns become 0)."""

    # The session is closed as soon as everything has been read from the page
    with HTMLSession() as s:
        r = s.get(url)

        def textOrNa(selector):
            """Text of the first match for selector; "na" on failure."""
            try:
                return r.html.find(selector, first=True).text
            except Exception:
                # Best effort: a missing element must not abort the whole scrape,
                # but KeyboardInterrupt/SystemExit are no longer swallowed
                return "na"

        # Create a df off scraped data
        df = pd.DataFrame({
            "instructorName": [textOrNa("div.udlite-heading-lg.instructor--instructor__title--34ItB")],
            "designation": [textOrNa("div.udlite-text-md.instructor--instructor__job-title--1HUmd")],
            # Instructor other info: one block of text, one statistic per line
            "instructorOtherInfo": [textOrNa("div.instructor--instructor__image-and-stats--1IqE7")],
            "instructorLink": [url],
        })

    # Extract instructor rating, review, total student, total course from
    # instructorOtherInfo; each value is the leading token of its line
    statLines = df.instructorOtherInfo.str.split("\n")

    def leadingNumber(line):
        """Leading token of each line as a number, thousands separators removed; 0 if unparsable."""
        return pd.to_numeric(line.str.split(" ").str[0].str.replace(",", ""), errors="coerce").fillna(0)

    # The rating is parsed without removing commas, as it has no thousands separator
    df["instructorRating"] = pd.to_numeric(statLines.str[0].str.split(" ").str[0], errors="coerce").fillna(0)
    df["instructorReview"] = leadingNumber(statLines.str[1])
    df["instructorStudent"] = leadingNumber(statLines.str[2])
    # Total courses is read from the last line of the block
    df["instructorTotalCourse"] = leadingNumber(statLines.str[-1])
    df.drop("instructorOtherInfo", axis=1, inplace=True)
    return df

In [9]:
# This function scrapes instructor details and merge it with the course data
def getFinalData(df):
    """Scrape instructor details and merge them with the course data.

    df = course data to merge with the instructor data
    (must have the columns instructorLink, courseLink and courseTitle).
    return = final dataframe of course data and instructor data."""

    # Scrape every distinct instructor link only once: repeated links would
    # cost extra requests and multiply the matching rows in the merge below
    uniqueLinks = df.instructorLink.drop_duplicates().tolist()

    # This portion get the instructor details
    with ProcessPoolExecutor(max_workers=6) as ex:
        insFrames = list(ex.map(scrapeInstructorInfo, uniqueLinks))
    insDf = pd.concat(insFrames).reset_index(drop=True)

    # Merge instructor data and course data on instructorLink
    masterDf = pd.merge(df, insDf, on="instructorLink")

    # Drop rows that repeat the same course link, course title and instructor name
    masterDf = masterDf.drop_duplicates(["courseLink", "courseTitle", "instructorName"])
    return masterDf

In [10]:
# Build the final dataset: the course data in c1 merged with the instructor details
masterDf1 = getFinalData(c1)
# Preview the first 10 rows
masterDf1.head(10)

Unnamed: 0,courseTitle,subtitle,courseLink,unitSold,originalPrice,lastPublishedOrUpdated,typeStarRating,broadCategory,subCategory,offerPrice,type,instructorLink,instructorName,designation,instructorRating,instructorReview,instructorStudent,instructorTotalCourse
0,Build Your Own Website from Scratch through Wo...,Learn in few easy steps how to build your own ...,https://www.udemy.com/course/build-your-own-fr...,1,19.99,Last updated 7/2020,New\nRating: 0.0 out of 50.0\n(0 ratings)\n1 s...,Design,Web Design,12.99,new,https://www.udemy.com/course/build-your-own-fr...,Savankumar Belamkar,Real Estate professional and Part time blogger,0.0,0.0,1.0,1
1,Make a WordPress Website for Your Business Thi...,Learn how to quickly and easily make a profess...,https://www.udemy.com/course/make-a-wordpress-...,9,54.99,Last updated 7/2020,New\nRating: 0.0 out of 50.0\n(0 ratings)\n9 s...,Design,Web Design,12.99,new,https://www.udemy.com/course/make-a-wordpress-...,Nicole Sauk,Website Designer,0.0,0.0,9.0,1
2,Angular Crash Course for Beginner Developers,Learn all the important concepts you need to g...,https://www.udemy.com/course/angular-crash-cou...,0,129.99,Published 8/2020,New\nRating: 0.0 out of 50.0\n(0 ratings)\n0 s...,Design,Web Design,12.99,new,https://www.udemy.com/course/angular-crash-cou...,THE K INSTRUCTOR,"programmer, developer, instructor",0.0,0.0,0.0,2
3,UX/UI Design Fundamentals. From Beginner to In...,Understanding Design at a deep level and stren...,https://www.udemy.com/course/uxui-design-funda...,5,49.99,Last updated 5/2020,New\nRating: 4.0 out of 54.0\n(1 rating)\n5 st...,Design,Web Design,12.99,new,https://www.udemy.com/course/uxui-design-funda...,Alex Matei,UX/UI Designer,4.0,1.0,5.0,1
4,Build Responsive Website using HTML5 and CSS3 ...,Build responsive websites with HTML5 and CSS3,https://www.udemy.com/course/build-responsive-...,3491,49.99,Last updated 8/2020,"New\nRating: 4.3 out of 54.3\n(23 ratings)\n3,...",Design,Web Design,12.99,new,https://www.udemy.com/course/build-responsive-...,Hani Mohammadi,CG & Aerospace,4.4,146.0,9127.0,5
5,On Page SEO Tactics for Wordpress Bloggers,How to increase search engine rankings by opti...,https://www.udemy.com/course/on-page-seo-tacti...,8045,19.99,Last updated 7/2020,"New\nRating: 3.8 out of 53.8\n(27 ratings)\n8,...",Design,Web Design,12.99,new,https://www.udemy.com/course/on-page-seo-tacti...,Sesan Oguntade,Personal Development,3.9,74.0,14538.0,10
6,WordPress for Beginners,A Beginner to Expert Course on WordPress. Cove...,https://www.udemy.com/course/beginners-course-...,11352,19.99,Last updated 8/2020,New\nRating: 4.5 out of 54.5\n(49 ratings)\n11...,Design,Web Design,12.99,new,https://www.udemy.com/course/beginners-course-...,Tariq SP,Entrepreneur with 16+ yrs of experience in Sal...,4.3,505.0,47986.0,3
7,Web Typography for Designers & Developers,Master the most essential building block of an...,https://www.udemy.com/course/web-typography-fo...,628,129.99,Last updated 8/2020,New\nRating: 4.8 out of 54.8\n(6 ratings)\n628...,Design,Web Design,12.99,new,https://www.udemy.com/course/web-typography-fo...,Rob Sutcliffe,UI Designer 7 Developer,4.5,4954.0,24986.0,3
8,Quickly build a WordPress website for business...,Use this system to build a responsive WordPres...,https://www.udemy.com/course/quickly-build-a-w...,0,24.99,Last updated 8/2020,New\nRating: 0.0 out of 50.0\n(0 ratings)\n0 s...,Design,Web Design,12.99,new,https://www.udemy.com/course/quickly-build-a-w...,David Sharkey,Professional WordPress Web Designer and Digita...,4.6,336.0,1152.0,4
9,WordPress Course: The Complete Guide (Step by ...,Learn to build WordPress websites from scratch...,https://www.udemy.com/course/learn-complete-wo...,14477,74.99,Last updated 7/2020,New\nRating: 4.3 out of 54.3\n(122 ratings)\n1...,Design,Web Design,12.99,new,https://www.udemy.com/course/learn-complete-wo...,Fatah Gabrial,Professional Trainer,4.3,2469.0,132389.0,21
