In [1]:
# Import required modules
import pandas as pd
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import re
import numpy as np
import itertools
from datetime import datetime
# Day_month stamp (format "%d_%b", no year) that is embedded in the output file name
today = datetime.now().strftime("%d_%b")

In [2]:
def generateCoverPageLink(url):
    """Return the sub-category cover page links found on a directory page.

    Parameters
    ----------
    url : str
        Address of the directory page to download.

    Returns
    -------
    list of str
        The ``href`` of the first ``<a>`` inside every ``<li class="subjects">``
        element, with any ``"virtual-classrooms/"`` segment removed. List items
        that have no anchor, or whose anchor has no ``href``, are skipped.

    Raises
    ------
    requests.RequestException
        If the request fails or does not answer within the timeout.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "lxml")

    coverPageLink = []
    for item in soup.find_all("li", class_="subjects"):
        anchor = item.find("a")
        href = anchor.get("href") if anchor is not None else None
        if href is None:
            # Malformed list item: nothing to follow, so skip it instead of crashing.
            continue
        # Drop string "virtual-classrooms/" from the scraped url if present
        coverPageLink.append(href.replace("virtual-classrooms/", ""))
    return coverPageLink



def createPagination(url, coursesPerPage=5):
    """Build the paginated URLs for one cover page.

    Parameters
    ----------
    url : str
        A single cover page link (one link, not a collection of links).
    coursesPerPage : int, optional
        Number of courses assumed to be listed on each result page. Must be
        positive. Defaults to 5, the value that used to be hard-coded here.

    Returns
    -------
    list of str
        ``url + "/?page=N"`` for ``N`` from 1 to the last page. The list is
        empty when the page cannot be fetched, has no element with id
        ``coursecountblocks``, or that element contains no digits.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

    try:
        response = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(response.text, "lxml")
        # .text raises AttributeError when the counter element is absent
        totalCourse = soup.find(id="coursecountblocks").text
        # Every run of digits is concatenated, so "1,234" is read as 1234;
        # int("") raises ValueError when there are no digits at all.
        intTotalCourse = int("".join(re.findall(r"[0-9]+", totalCourse)))
    except (requests.RequestException, AttributeError, ValueError):
        # Best effort: a cover page that cannot be read contributes no links.
        # Only the expected failures are caught, so Ctrl-C and genuine
        # programming errors are no longer silently swallowed.
        return []

    # Ceiling division without going through floats
    stopPage = -(-intTotalCourse // coursesPerPage)
    return [url + f"/?page={page}" for page in range(1, stopPage + 1)]


def _textOrNa(tag):
    """Return the stripped text of a tag, or "na" when the tag was not found."""
    return tag.text.strip() if tag is not None else "na"


def _parseCourseBox(box, dealStatus, coverPage):
    """Build the record (one output row) for a single course box."""
    titleTag = box.find("span", class_="truncate")
    headings = box.find_all("h2")
    # The provider is the first <small> found inside any of the box's <h2> tags
    providerTag = next(
        (small for heading in headings for small in heading.find_all("small")),
        None,
    )
    # The duration is the <strong> inside the first span.info2
    durationTag = next(
        (info.find("strong") for info in box.find_all("span", class_="info2")),
        None,
    )
    # The individual course link is the first <a> of the first <h2>
    anchor = headings[0].find("a") if headings else None
    href = anchor.get("href") if anchor is not None else None
    return {
        "courseTitle": _textOrNa(titleTag),
        "courseLink": href if href is not None else "na",
        "courseProvider": _textOrNa(providerTag),
        "coverPageLink": coverPage,
        # The whole box text is stored; cleanAndEngineerFeatures later cuts the
        # price out of it by splitting on "USD".
        "offerPrice": box.text.strip(),
        "status": dealStatus,
        "duration": _textOrNa(durationTag),
    }


def scrapeCourseInfo(url):
    """Scrape the course boxes listed on one paginated cover page.

    Parameters
    ----------
    url : str
        A single paginated cover page link.

    Returns
    -------
    pandas.DataFrame
        Columns ``courseTitle``, ``courseLink``, ``courseProvider``,
        ``coverPageLink``, ``offerPrice``, ``status`` and ``duration``.
        There is exactly one row per course box: active boxes
        (``div.courses-box.prospectsactive``) first with status ``"active"``,
        then inactive boxes (``div.courses-box.nonactive``) with status
        ``"inactive"``. A field whose element is missing from a box is
        ``"na"``. When the page has no box at all, a single placeholder row
        is returned in which everything except ``coverPageLink`` is ``"na"``.

    Raises
    ------
    requests.RequestException
        If the request fails or does not answer within the timeout.
    """
    columns = [
        "courseTitle",
        "courseLink",
        "courseProvider",
        "coverPageLink",
        "offerPrice",
        "status",
        "duration",
    ]

    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "lxml")

    # A page may hold active deals, inactive deals, both, or neither.
    boxKinds = (
        ("courses-box prospectsactive", "active"),
        ("courses-box nonactive", "inactive"),
    )
    # One dict per box keeps every column the same length by construction.
    records = [
        _parseCourseBox(box, dealStatus, url)
        for boxClass, dealStatus in boxKinds
        for box in soup.find_all("div", class_=boxClass)
    ]

    # Placeholder row only when the page has neither kind of deal
    if not records:
        placeholder = dict.fromkeys(columns, "na")
        placeholder["coverPageLink"] = url
        records.append(placeholder)

    return pd.DataFrame(records, columns=columns)

def cleanAndEngineerFeatures(df):
    """Clean the scraped columns and derive new features.

    The input frame is left untouched; a new frame is returned, so the
    function can safely be called again on the same input.

    Parameters
    ----------
    df : pandas.DataFrame
        Scraped rows with at least the string columns ``courseTitle``,
        ``coverPageLink`` and ``offerPrice``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which

        * ``offerPrice`` is reduced to the first space-separated token that
          follows ``"USD"`` and renamed to ``offerPriceUSD``;
        * ``broadCategory`` is the "/"-separated part of ``coverPageLink`` at
          position 4 and ``subCategory`` the second-to-last part;
        * ``whichPage`` is whatever follows the last ``"="`` in
          ``coverPageLink`` (kept as a string);
        * rows whose ``courseTitle`` is ``"na"`` are removed. The remaining
          rows keep their original index labels.
    """
    coverLinkParts = df["coverPageLink"].str.split("/")

    engineered = df.assign(
        # Extract offer price
        offerPrice=(
            df["offerPrice"]
            .str.split("USD").str.get(1)
            .str.strip()
            .str.split(" ").str.get(0)
        ),
        # Extract broad and sub category
        broadCategory=coverLinkParts.str.get(4),
        subCategory=coverLinkParts.str.get(-2),
        # Extract page number to check the course ranking
        whichPage=df["coverPageLink"].str.split("=").str.get(-1),
    ).rename(columns={"offerPrice": "offerPriceUSD"})

    # Drop placeholder rows. A boolean mask removes exactly the "na" rows,
    # even if the index contains duplicate labels.
    return engineered.loc[engineered["courseTitle"] != "na"].copy()

In [3]:
# Scrape all cover page links
def scrapeCoverPageLink(url):
    """Collect every paginated cover page link reachable from a directory page.

    ``generateCoverPageLink(url)`` supplies the sub-category links;
    ``createPagination`` is then run on each of them in a thread pool and the
    per-link results are flattened, in order, into one list.
    """
    subCategoryLinks = generateCoverPageLink(url)

    with ThreadPoolExecutor() as pool:
        pagesPerLink = list(pool.map(createPagination, subCategoryLinks))
        # Flatten the list of lists
        flattened = list(itertools.chain.from_iterable(pagesPerLink))
    return flattened

In [4]:
%%time
# Build the full list of paginated cover page links (slow; only needs to run once)
allCoverPageLink = scrapeCoverPageLink(url="https://courses.laimoon.com/online/directory")

CPU times: user 2min 8s, sys: 24.9 s, total: 2min 32s
Wall time: 1min 50s


In [5]:
# Number of paginated cover page links that will be scraped
len(allCoverPageLink)

14424

In [6]:
# Scrape course info in chunks
def main(s1, s2):
    """Scrape and clean one chunk of the cover page links.

    The chunk is ``allCoverPageLink[s1:s2]``, so ``s1`` is the first index
    included and ``s2`` the first index excluded (``None`` means "to the end").
    Each link is scraped with ``scrapeCourseInfo`` in a thread pool; the
    per-page frames are stacked, re-indexed from zero and passed through
    ``cleanAndEngineerFeatures``, whose result is returned.
    """
    chunkLinks = allCoverPageLink[s1:s2]

    with ThreadPoolExecutor() as pool:
        pageFrames = list(pool.map(scrapeCourseInfo, chunkLinks))

    stacked = pd.concat(pageFrames).reset_index(drop=True)
    return cleanAndEngineerFeatures(stacked)

In [7]:
%%time
# Chunk 1: cover page links at indices 0-999 (the end index is exclusive)
df1 = main(s1=0, s2=1000)

CPU times: user 4min 30s, sys: 48.1 s, total: 5min 18s
Wall time: 3min 47s


In [8]:
%%time
# Chunk 2: cover page links at indices 1000-1999
df2 = main(s1=1000, s2=2000)

CPU times: user 4min 8s, sys: 36 s, total: 4min 44s
Wall time: 3min 59s


In [9]:
%%time
# Chunk 3: cover page links at indices 2000-2999
df3 = main(s1=2000, s2=3000)

CPU times: user 4min 3s, sys: 35.6 s, total: 4min 39s
Wall time: 3min 56s


In [10]:
%%time
# Chunk 4: cover page links at indices 3000-3999
df4 = main(s1=3000, s2=4000)

CPU times: user 4min 6s, sys: 36.2 s, total: 4min 42s
Wall time: 3min 55s


In [11]:
%%time
# Chunk 5: cover page links at indices 4000-4999
df5 = main(s1=4000, s2=5000)

CPU times: user 4min 34s, sys: 47.4 s, total: 5min 22s
Wall time: 3min 55s


In [12]:
%%time
# Chunk 6: cover page links at indices 5000-5999
df6 = main(s1=5000, s2=6000)

CPU times: user 4min 34s, sys: 47.3 s, total: 5min 21s
Wall time: 3min 55s


In [13]:
%%time
# Chunk 7: cover page links at indices 6000-6999
df7 = main(s1=6000, s2=7000)

CPU times: user 4min 26s, sys: 45.7 s, total: 5min 12s
Wall time: 3min 49s


In [14]:
%%time
# Chunk 8: cover page links at indices 7000-7999
df8 = main(s1=7000, s2=8000)

CPU times: user 4min 29s, sys: 45.8 s, total: 5min 15s
Wall time: 3min 46s


In [15]:
%%time
# Chunk 9: cover page links at indices 8000-8999
df9 = main(s1=8000, s2=9000)

CPU times: user 4min 24s, sys: 45.4 s, total: 5min 10s
Wall time: 3min 46s


In [16]:
%%time
# Chunk 10: cover page links at indices 9000-9999
df10 = main(s1=9000, s2=10000)

CPU times: user 4min 35s, sys: 47 s, total: 5min 22s
Wall time: 3min 54s


In [17]:
%%time
# Chunk 11: cover page links at indices 10000-10999
df11 = main(s1=10000, s2=11000)

CPU times: user 4min 38s, sys: 48.8 s, total: 5min 27s
Wall time: 3min 56s


In [18]:
%%time
# Chunk 12: cover page links at indices 11000-11999
df12 = main(s1=11000, s2=12000)

CPU times: user 4min 35s, sys: 47.6 s, total: 5min 22s
Wall time: 3min 52s


In [19]:
%%time
# Chunk 13: cover page links at indices 12000-12999
df13 = main(s1=12000, s2=13000)

CPU times: user 4min 30s, sys: 46.4 s, total: 5min 17s
Wall time: 3min 48s


In [20]:
%%time
# Chunk 14: from index 13000 through the end of the list
df14 = main(s1=13000, s2=None)

CPU times: user 6min 21s, sys: 1min 5s, total: 7min 27s
Wall time: 5min 22s


In [21]:
# Stack the fourteen chunk frames, remove exact duplicate rows (keeping the
# first occurrence) and renumber the index from zero
masterDf = (
    pd.concat([df1, df2, df3, df4, df5, df6, df7,
               df8, df9, df10, df11, df12, df13, df14])
    .drop_duplicates(keep="first")
    .reset_index(drop=True)
)

# Preview the first ten rows
masterDf.head(10)

Unnamed: 0,courseTitle,courseLink,courseProvider,coverPageLink,offerPriceUSD,status,duration,broadCategory,subCategory,whichPage
0,REVIT STRUCTURE,https://courses.laimoon.com/course/autodesk-re...,Al Mihad Training Centre,https://courses.laimoon.com/online/engineering...,374,active,Upto 60 Hours,engineering,civil-engineering,1
1,AUTOCAD,https://courses.laimoon.com/course/autodesk-au...,Al Mihad Training Centre,https://courses.laimoon.com/online/engineering...,286,active,Upto 45 Hours,engineering,civil-engineering,1
2,3DS MAX WITH V-RAY,https://courses.laimoon.com/course/autodesk-3d...,Al Mihad Training Centre,https://courses.laimoon.com/online/engineering...,510,active,Upto 60 Hours,engineering,civil-engineering,1
3,Revit Dynamo with Python Scripting,https://courses.laimoon.com/course/autodesk-re...,Al Mihad Training Centre,https://courses.laimoon.com/online/engineering...,408,active,Upto 40 Hours,engineering,civil-engineering,1
4,SYNCHRO 4D CONSTRUCTION PLANNING,https://courses.laimoon.com/course/synchro-pro...,Al Mihad Training Centre,https://courses.laimoon.com/online/engineering...,490,active,Upto 35 Hours,engineering,civil-engineering,1
5,PRIMAVERA P6,https://courses.laimoon.com/course/advanced-pr...,Al Mihad Training Centre,https://courses.laimoon.com/online/engineering...,381,active,Upto 40 Hours,engineering,civil-engineering,2
6,Rhinoceros with Vray,https://courses.laimoon.com/course/rhinoceros-...,Al Mihad Training Centre,https://courses.laimoon.com/online/engineering...,510,active,Upto 45 Hours,engineering,civil-engineering,2
7,RHINOCEROS WITH GRASSHOPPER,https://courses.laimoon.com/course/rhino-grass...,Al Mihad Training Centre,https://courses.laimoon.com/online/engineering...,667,active,Upto 35 Hours,engineering,civil-engineering,2
8,BIM - Building Information with Revit,https://courses.laimoon.com/course/part-time-b...,Al Mihad Training Centre,https://courses.laimoon.com/online/engineering...,1225,active,Upto 120 Hours,engineering,civil-engineering,2
9,Auto Cad Civil 3D,https://courses.laimoon.com/course/auto-cad-ci...,Elegant Training Center,https://courses.laimoon.com/online/engineering...,429,active,Upto 24 Hours,engineering,civil-engineering,2


In [22]:
# Save as a csv file named after today's day_month stamp.
# index=False is the documented way to leave the row index out of the file
# (index=None only worked because None is falsy).
masterDf.to_csv(f"laimoon_{today}.csv", index=False)