In [1]:
# Import required modules
import re
from datetime import date
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Fetch the course catalogue page and parse it into a BeautifulSoup tree.
# The timeout keeps the cell from hanging forever on an unresponsive server,
# and raise_for_status() fails fast on a 4xx/5xx response so that an error page
# is never scraped as if it were the catalogue.
r = requests.get("https://www.shawacademy.com/courses", timeout=30)
r.raise_for_status()
s = BeautifulSoup(r.text, "lxml")

In [3]:
# Each list below is built from its own query over the parsed page `s`;
# entries are kept in document order.

# Course titles: full text of every "course-name" div.
title = [node.get_text() for node in s.find_all("div", class_="course-name")]

# Course links: href of the first <a> inside every "course-name" div.
courseLink = [
    node.find("a").get("href")
    for node in s.find_all("div", class_="course-name")
]

# Units sold: whitespace-trimmed text of every "enroll-count" div.
unitSold = [
    node.get_text().strip()
    for node in s.find_all("div", class_="enroll-count")
]

# Ratings: trimmed text of the first <p> inside every "rating" div.
rating = [
    node.find("p").get_text().strip()
    for node in s.find_all("div", class_="rating")
]

# Descriptions: trimmed text of the first <p> inside every
# "course-description" div.
des = [
    node.find("p").get_text().strip()
    for node in s.find_all("div", class_="course-description")
]

In [4]:
# Assemble the scraped lists into one DataFrame, one row per course.
# pandas raises ValueError here if the lists do not all have the same length.
df = pd.DataFrame({
    "courseTitle": title,
    "courseLink": courseLink,
    "unitSold": unitSold,
    "rating": rating,
    "description": des
})

# Turn the scraped hrefs into absolute course links.
# urljoin resolves site-relative paths against the site root and leaves hrefs
# that are already absolute untouched, whereas plain string concatenation would
# produce a malformed URL for them. na_action="ignore" keeps missing hrefs
# missing instead of replacing them with the bare site root.
df["courseLink"] = df["courseLink"].map(
    lambda href: urljoin("https://www.shawacademy.com", href),
    na_action="ignore",
)

In [5]:
# Clean unitSold and cast the numeric columns to proper numeric dtypes.
# The leading astype(str) makes this cell safe to re-run: after a first
# execution unitSold is already integer, and the .str accessor would otherwise
# raise AttributeError. regex=False makes both replacements literal on every
# pandas version, and strip() drops the whitespace left once the "Graduates"
# label is removed.
df["unitSold"] = (
    df["unitSold"]
    .astype(str)
    .str.replace("Graduates", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.strip()
    .astype("int")
)
df["rating"] = df["rating"].astype("float")

In [6]:
# Preview the first ten rows of the cleaned data
df.head(n=10)

Unnamed: 0,courseTitle,courseLink,unitSold,rating,description
0,Photography,https://www.shawacademy.com/courses/photograph...,1503225,4.8,Become a professional photographer with comple...
1,iPhone & Android Photography,https://www.shawacademy.com/courses/photograph...,177739,4.6,"Learn how to take high quality, amazing shots ..."
2,Video Editing and Production,https://www.shawacademy.com/courses/photograph...,53755,4.2,Discover everything you need to know about vid...
3,Wedding Photography,https://www.shawacademy.com/courses/photograph...,32749,4.6,Learn how to take the perfect wedding portrait...
4,Adobe Lightroom,https://www.shawacademy.com/courses/photograph...,80128,4.6,Learn how to use Adobe Lightroom and discover ...
5,Online Photoshop,https://www.shawacademy.com/courses/photograph...,211687,4.8,"Master the fundamentals of Photoshop, from the..."
6,Digital Marketing,https://www.shawacademy.com/courses/marketing/...,471518,4.9,Understand different marketing channels and ta...
7,Social Media Marketing,https://www.shawacademy.com/courses/marketing/...,341432,4.8,Learn how to build and launch an effective soc...
8,Creative Writing,https://www.shawacademy.com/courses/marketing/...,37268,4.9,Take your prose to the next level and discover...
9,SEO & Digital Marketing,https://www.shawacademy.com/courses/marketing/...,25435,4.8,Unlock the secrets to growing your visibility ...


In [7]:
# Stamp the export with today's date (day_month-abbreviation_2-digit-year)
# and save the data to an Excel workbook in the working directory.
today = date.today().strftime("%d_%b_%y")
# index=False is the documented way to omit the row index from the sheet;
# the previous index=None only worked because None happens to be falsy.
df.to_excel(f"shawAcademy_{today}.xlsx", index=False)