In [1]:
# webscraping libraries
import requests
from bs4 import BeautifulSoup

# other libraries 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

pd.options.display.max_colwidth = 350

### Scrape Courses by subject and difficulty level
- Data Science (`data-science`)
- Business (`business`)
- Computer Science (`computer-science`)
- Physical Science and Engineering (`physical-science-and-engineering`)
- Social Science (`social-sciences`)

In [3]:
# define a function scrape the first 16 most popular courses in English for a given subject, difficulty level

'''
https://www.coursera.org/browse/physical-science-and-engineering?facets=difficultyLevelTag%3ABeginner%2Clanguages%3AEnglish%2CentityTypeTag%3ACourses%2CcategoryMultiTag%3A{subject}&'+'sortField=mostPopularByEnrollmentsNumericTag
'''


def cat_pop_16(subject='data-science', level='Beginner'):
    url = (f'https://www.coursera.org/browse/{subject}?facets=difficultyLevelTag%3A{level}'+
       f'%2Clanguages%3AEnglish%2CentityTypeTag%3ACourses%2CcategoryMultiTag%3A{subject}&'+
       'sortField=mostPopularByEnrollmentsNumericTag')
    res = requests.get(url)
    soup = BeautifulSoup(res.content)
    href_list = [each.find('a').attrs['href'] for each in soup.find_all('div', {'class': 'offering-wrapper'})]
    
    href_dict = {
        'course_href': href_list,
        'subject': subject,
        'level': level
    }
    
    return pd.DataFrame(href_dict)

In [4]:
subjects = ['data-science', 'business', 'computer-science', 'physical-science-and-engineering', 'social-sciences']
levels = ['Beginner', 'Intermediate', 'Advanced']

list_herf_df = []

for subject in subjects:
    for level in levels:
        list_herf_df.append(cat_pop_16(subject, level))

In [5]:
href_df = pd.concat(list_herf_df, ignore_index=True)

In [34]:
href_df.shape

(222, 3)

In [7]:
href_df.head()

Unnamed: 0,course_href,subject,level
0,/learn/excel-essentials,data-science,Beginner
1,/learn/what-is-datascience,data-science,Beginner
2,/learn/machine-learning-projects,data-science,Beginner
3,/learn/sql-for-data-science,data-science,Beginner
4,/learn/python-for-applied-data-science-ai,data-science,Beginner


### Scrape course Info

In [12]:
# url = f'https://www.coursera.org/learn/what-is-datascience'
# res = requests.get(url) 
# soup = BeautifulSoup(res.content)

In [13]:
# set([each.text for each in soup.find_all('div', {'class': '_1k3yl1y'})])

In [18]:
# course_herf = '/learn/astro'
# https://www.coursera.org/learn/astro

def course_info(course_herf='/learn/what-is-datascience'):
    url = f'https://www.coursera.org{course_herf}'
    res = requests.get(url) 
    soup = BeautifulSoup(res.content)

    dict_course_info = {
        'course_href': course_herf,
        'course_name': soup.find('h1').text,
        'partner_title': soup.find('h4').text,
        'instructor': soup.find('div', {'class': '_1wpyc64q'}).find('span').text,
        'stars': soup.find('div', {'class': 'rc-ReviewsOverview__totals__rating'}).text,
        'num_reviews': soup.find('span', {'itemprop': 'reviewCount'}).text,
        'description': soup.find('div', {'class': 'm-t-1 description'}).text,
        'outcome': list(set([each.text for each in soup.find_all('div', {'class': '_1k3yl1y'})])),
        'highlight': list(set([each.text for each in soup.find_all('div', {'class': '_16ni8zai m-b-0'})])),
        'length': soup.find_all('div', {'class': '_1tu07i3a'})[-2].find('span').text}
    return dict_course_info

In [19]:
list_info = []

for each in list(href_df['course_href']):
    try:
        list_info.append(course_info(each))
    except:
        pass

course_info = pd.DataFrame(list_info)

In [20]:
course_info.shape

(198, 10)

In [22]:
course_info.head(1)

Unnamed: 0,course_href,course_name,partner_title,instructor,stars,num_reviews,description,outcome,highlight,length
0,/learn/excel-essentials,Excel Skills for Business: Essentials,Macquarie University,Nicky Bull +2 more instructors,4.9,9440,"In this first course of the specialization Excel Skills for Business, you will learn the essentials of Microsoft Excel. Within six weeks, you will be able to expertly navigate the Excel user interface, perform basic calculations with formulas and functions, professionally format spreadsheets, and create visualizations of data through charts and...","[22%started a new career after completing these courses, 22%got a tangible career benefit from this course, 11%got a pay increase or promotion]","[Course 1 of 4 in the, Beginner Level, Flexible deadlines, 100% online, English, Shareable Certificate]",Approx. 26 hours to complete


In [28]:
course_info.shape

(198, 10)

In [38]:
course_df = course_info.merge(href_df, left_on='course_href', right_on='course_href')

In [45]:
course_df.head()

Unnamed: 0,course_href,course_name,partner_title,instructor,stars,num_reviews,description,outcome,highlight,length,subject,level
0,/learn/excel-essentials,Excel Skills for Business: Essentials,Macquarie University,Nicky Bull +2 more instructors,4.9,9440,"In this first course of the specialization Excel Skills for Business, you will learn the essentials of Microsoft Excel. Within six weeks, you will be able to expertly navigate the Excel user interface, perform basic calculations with formulas and functions, professionally format spreadsheets, and create visualizations of data through charts and...","[22%started a new career after completing these courses, 22%got a tangible career benefit from this course, 11%got a pay increase or promotion]","[Course 1 of 4 in the, Beginner Level, Flexible deadlines, 100% online, English, Shareable Certificate]",Approx. 26 hours to complete,data-science,Beginner
1,/learn/excel-essentials,Excel Skills for Business: Essentials,Macquarie University,Nicky Bull +2 more instructors,4.9,9440,"In this first course of the specialization Excel Skills for Business, you will learn the essentials of Microsoft Excel. Within six weeks, you will be able to expertly navigate the Excel user interface, perform basic calculations with formulas and functions, professionally format spreadsheets, and create visualizations of data through charts and...","[22%started a new career after completing these courses, 22%got a tangible career benefit from this course, 11%got a pay increase or promotion]","[Course 1 of 4 in the, Beginner Level, Flexible deadlines, 100% online, English, Shareable Certificate]",Approx. 26 hours to complete,business,Beginner
2,/learn/excel-essentials,Excel Skills for Business: Essentials,Macquarie University,Nicky Bull +2 more instructors,4.9,9440,"In this first course of the specialization Excel Skills for Business, you will learn the essentials of Microsoft Excel. Within six weeks, you will be able to expertly navigate the Excel user interface, perform basic calculations with formulas and functions, professionally format spreadsheets, and create visualizations of data through charts and...","[22%started a new career after completing these courses, 22%got a tangible career benefit from this course, 11%got a pay increase or promotion]","[Course 1 of 4 in the, Beginner Level, Flexible deadlines, 100% online, English, Shareable Certificate]",Approx. 26 hours to complete,data-science,Beginner
3,/learn/excel-essentials,Excel Skills for Business: Essentials,Macquarie University,Nicky Bull +2 more instructors,4.9,9440,"In this first course of the specialization Excel Skills for Business, you will learn the essentials of Microsoft Excel. Within six weeks, you will be able to expertly navigate the Excel user interface, perform basic calculations with formulas and functions, professionally format spreadsheets, and create visualizations of data through charts and...","[22%started a new career after completing these courses, 22%got a tangible career benefit from this course, 11%got a pay increase or promotion]","[Course 1 of 4 in the, Beginner Level, Flexible deadlines, 100% online, English, Shareable Certificate]",Approx. 26 hours to complete,business,Beginner
4,/learn/what-is-datascience,What is Data Science?,IBM,Rav Ahuja +1 more instructor,4.7,8354,"The art of uncovering the insights and trends in data has been around since ancient times. The ancient Egyptians used census data to increase efficiency in tax collection and they accurately predicted the flooding of the Nile river every year. Since then, people working in data science have carved out a unique and distinct field for the work th...","[29%started a new career after completing these courses, 11%got a pay increase or promotion, 30%got a tangible career benefit from this course]","[English, 100% online, Flexible deadlines, Shareable Certificate]",Approx. 10 hours to complete,data-science,Beginner


In [40]:
# course_df.to_csv('./data/course_df.csv')

---
### Scrape Reviews

In [41]:
# sample URL: https://www.coursera.org/learn/data-analysis-with-python/reviews?star=1

def pages_of_reviews(course_href='/learn/excel-essentials', rating=2):
    url = f'https://www.coursera.org/{course_href}/reviews?star={rating}'
    res = requests.get(url) 
    soup = BeautifulSoup(res.content)
    last_page = int(soup.find('ul', {'class':'_l6n5rs cui-buttonList'})
                    .find_all('li', {'style':'display:inline-block'})[-2].text)
    return last_page

In [42]:
# sample URL: https://www.coursera.org/learn/data-analysis-with-python/reviews?star=1&page=2

def scrape_reviews_by_rating_page(course_href='/learn/excel-essentials', rating=2, review_page):
    url = f'https://www.coursera.org/{course_href}/reviews?star={rating}&page={review_page}'
    res = requests.get(url) 
    soup = BeautifulSoup(res.content)
    list_of_reviews = [review.text for review in soup.find_all('div', {'class':'reviewText'})]
    list_of_dates = [review.text for review in soup.find_all('div', {'class':'dateOfReview p-x-1s m-b-0 text-secondary font-xs'})]
    dict_reviews = {
        'review': list_of_reviews,
        'date_of_review': list_of_dates,
        'rating': rating,
        'course_href': course_href}
    df = pd.DataFrame(dict_reviews)
    return df 

In [43]:
# sample URL: https://www.coursera.org/learn/data-analysis-with-python

def scrape_reviews(course_href='/learn/excel-essentials'):
    df_list = []
    for rating in range(1,6):
        pages = pages_of_reviews(course_href, rating)
        for review_page in range(1, pages+1):
            df = scrape_reviews_by_rating_page(course_href, rating, review_page)
            df_list.append(df)
    return pd.concat(df_list, ignore_index=True)

In [44]:
testing = scrape_reviews('/learn/excel-essentials')


In [50]:
reviews_ds_beginner = testing.merge(href_df, left_on='course_href', right_on='course_href')

In [51]:
reviews_ds_beginner.to_csv('./data/reviews_ds_beginner')