In [1]:
# webscraping libraries
import requests
from bs4 import BeautifulSoup

# other libraries 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

pd.options.display.max_colwidth = 350

### Scrape Courses by subject and difficulty level
- Data Science (`data-science`)
- Business (`business`)
- Computer Science (`computer-science`)
- Physical Science and Engineering (`physical-science-and-engineering`)
- Social Sciences (`social-sciences`)

In [2]:
# Define a function to scrape the first 16 most popular English-language courses for a given subject and difficulty level

'''
https://www.coursera.org/browse/physical-science-and-engineering?facets=difficultyLevelTag%3ABeginner%2Clanguages%3AEnglish%2CentityTypeTag%3ACourses%2CcategoryMultiTag%3Aphysical-science-and-engineering&sortField=mostPopularByEnrollmentsNumericTag
'''


def cat_pop_16(subject, level, timeout=30):
    """Scrape the course links shown on a Coursera browse page.

    The browse URL filters to English-language courses of the given subject
    and difficulty level and asks for them sorted by enrollment. Every
    ``offering-wrapper`` card found in the returned HTML contributes one link.

    Parameters
    ----------
    subject : str
        Subject slug used in the browse URL, e.g. ``'data-science'``.
    level : str
        Difficulty level tag, e.g. ``'Beginner'``.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per link found, with columns ``course_href``, ``subject``
        and ``level`` (the latter two repeat the arguments on every row).

    Raises
    ------
    requests.RequestException
        If the request times out, fails, or returns an HTTP error status.
    """
    url = (f'https://www.coursera.org/browse/{subject}?facets=difficultyLevelTag%3A{level}'+
       f'%2Clanguages%3AEnglish%2CentityTypeTag%3ACourses%2CcategoryMultiTag%3A{subject}&'+
       'sortField=mostPopularByEnrollmentsNumericTag')
    res = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty result.
    res.raise_for_status()
    # Name the parser explicitly so results do not depend on what is installed.
    soup = BeautifulSoup(res.content, 'html.parser')

    href_list = []
    for card in soup.find_all('div', {'class': 'offering-wrapper'}):
        link = card.find('a')
        # Skip cards that carry no usable link rather than crashing the scrape.
        if link is not None and link.has_attr('href'):
            href_list.append(link['href'])

    href_dict = {
        'course_href': href_list,
        'subject': subject,
        'level': level
    }

    return pd.DataFrame(href_dict)

In [3]:
# Subject slugs and difficulty tags to crawl; every pairing is scraped once.
subjects = [
    'data-science',
    'business',
    'computer-science',
    'physical-science-and-engineering',
    'social-sciences',
]
levels = ['Beginner', 'Intermediate', 'Advanced']

# One DataFrame of course links per (subject, level) pair, subjects outermost.
list_herf_df = [
    cat_pop_16(subject, level)
    for subject in subjects
    for level in levels
]

In [4]:
# Stack the per-(subject, level) frames into one table with a fresh 0..n-1 index.
href_df = pd.concat(list_herf_df, ignore_index=True)

In [6]:
# (rows, columns) of the combined link table.
href_df.shape

(215, 3)

In [7]:
# Preview the first rows of the combined link table.
href_df.head()

Unnamed: 0,course_href,subject,level
0,/learn/excel-essentials,data-science,Beginner
1,/learn/what-is-datascience,data-science,Beginner
2,/learn/machine-learning-projects,data-science,Beginner
3,/learn/sql-for-data-science,data-science,Beginner
4,/learn/python-for-applied-data-science-ai,data-science,Beginner


### Scrape Course Info

In [78]:
# course_herf = '/learn/astro'
# https://www.coursera.org/learn/astro

def course_info(course_herf, timeout=30):
    """Scrape the summary fields of a single Coursera course page.

    Parameters
    ----------
    course_herf : str
        Site-relative course path, e.g. ``'/learn/astro'``; it is appended to
        ``https://www.coursera.org``.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    dict
        Keys: ``course_href``, ``course_name``, ``partner_title``,
        ``instructor``, ``stars``, ``num_reviews``, ``description``,
        ``outcome1``, ``outcome2``, ``certificate``, ``method``,
        ``deadline``, ``language``, ``length``. All values are the raw text
        of the matched elements (``course_href`` echoes the argument).

    Raises
    ------
    requests.RequestException
        If the request times out, fails, or returns an HTTP error status.
    AttributeError, IndexError
        If any expected element is missing from the page.
    """
    url = f'https://www.coursera.org{course_herf}'
    res = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page.
    res.raise_for_status()
    # Name the parser explicitly so results do not depend on what is installed.
    soup = BeautifulSoup(res.content, 'html.parser')

    # NOTE(review): the underscore-prefixed class names look auto-generated and
    # will presumably change when the site is rebuilt - confirm before reuse.
    # Each repeated query is run once and then indexed by position.
    outcomes = soup.find_all('div', {'class': '_1k3yl1y'})
    glance_items = soup.find_all('div', {'class': '_16ni8zai m-b-0'})
    length_items = soup.find_all('div', {'class': '_1tu07i3a'})

    dict_course_info = {
        'course_href': course_herf,
        'course_name': soup.find('h1').text,
        'partner_title': soup.find('h4').text,
        'instructor': soup.find('div', {'class': '_1wpyc64q'}).find('span').text,
        'stars': soup.find('div', {'class': 'rc-ReviewsOverview__totals__rating'}).text,
        'num_reviews': soup.find('span', {'itemprop': 'reviewCount'}).text,
        'description': soup.find('div', {'class': 'm-t-1 description'}).text,
        'outcome1': outcomes[0].text,
        'outcome2': outcomes[1].text,
        # The next four fields are identified purely by their position on the page.
        'certificate': glance_items[0].text,
        'method': glance_items[1].text,
        'deadline': glance_items[2].text,
        'language': glance_items[3].text,
        'length': length_items[-2].find('span').text}
    return dict_course_info

In [79]:
list_info = []
# (href, error) pairs for pages that could not be scraped, kept for inspection
# instead of being dropped without a trace.
failed_hrefs = []

for each in list(href_df['course_href']):
    try:
        list_info.append(course_info(each))
    except Exception as err:
        # Stay best-effort across pages, but catch Exception (not a bare
        # except) so the scrape can still be interrupted from the keyboard.
        failed_hrefs.append((each, repr(err)))

# Named course_info_df so it does not shadow the course_info() function;
# shadowing made every call fail silently when this cell was re-run.
course_info_df = pd.DataFrame(list_info)

---
### Scrape Reviews

In [81]:
# sample URL: https://www.coursera.org/learn/data-analysis-with-python/reviews?star=1

def pages_of_reviews(course_name, rating, timeout=30):
    """Return how many review pages a course has for one star rating.

    Parameters
    ----------
    course_name : str
        Course slug as used in ``/learn/{course_name}``.
    rating : int
        Star rating passed as the ``star`` query parameter.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    int
        The number read from the second-to-last pagination item, or ``1``
        when the page has no pagination control.

    Raises
    ------
    requests.RequestException
        If the request times out, fails, or returns an HTTP error status.
    ValueError
        If the pagination item's text is not an integer.
    """
    url = f'https://www.coursera.org/learn/{course_name}/reviews?star={rating}'
    res = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page.
    res.raise_for_status()
    # Name the parser explicitly so results do not depend on what is installed.
    soup = BeautifulSoup(res.content, 'html.parser')

    pager = soup.find('ul', {'class': '_l6n5rs cui-buttonList'})
    if pager is None:
        # No pagination control: treat the listing as a single page.
        return 1

    items = pager.find_all('li', {'style': 'display:inline-block'})
    if len(items) < 2:
        return 1

    # Presumably the last item is a "next" control and the one before it holds
    # the highest page number - TODO confirm against the live markup.
    last_page = int(items[-2].text)
    return last_page

In [82]:
# sample URL: https://www.coursera.org/learn/data-analysis-with-python/reviews?star=1&page=2

def scrape_reviews_by_rating_page(course_name, rating, review_page, timeout=30):
    """Scrape one page of reviews for a course at a given star rating.

    Parameters
    ----------
    course_name : str
        Course slug as used in ``/learn/{course_name}``.
    rating : int
        Star rating passed as the ``star`` query parameter.
    review_page : int
        Page number passed as the ``page`` query parameter.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        Columns ``review``, ``date_of_review``, ``rating`` and
        ``course_name``; one row per review found on the page (empty when
        the page holds none).

    Raises
    ------
    requests.RequestException
        If the request times out, fails, or returns an HTTP error status.
    ValueError
        If the page yields a different number of review texts and dates.
    """
    url = f'https://www.coursera.org/learn/{course_name}/reviews?star={rating}&page={review_page}'
    res = requests.get(url, timeout=timeout)
    # Fail loudly instead of turning an error page into an empty frame.
    res.raise_for_status()
    # Name the parser explicitly so results do not depend on what is installed.
    soup = BeautifulSoup(res.content, 'html.parser')

    list_of_reviews = [review.text for review in soup.find_all('div', {'class':'reviewText'})]
    list_of_dates = [review.text for review in soup.find_all('div', {'class':'dateOfReview p-x-1s m-b-0 text-secondary font-xs'})]

    # Texts and dates are paired by position, so unequal counts cannot be
    # aligned; report that clearly rather than via pandas' generic error.
    if len(list_of_reviews) != len(list_of_dates):
        raise ValueError(
            f'found {len(list_of_reviews)} reviews but {len(list_of_dates)} dates at {url}')

    dict_reviews = {
        'review': list_of_reviews,
        'date_of_review': list_of_dates,
        'rating': rating,
        'course_name': course_name}
    df = pd.DataFrame(dict_reviews)
    return df

In [83]:
# sample URL: https://www.coursera.org/learn/data-analysis-with-python

def scrape_reviews(course_name):
    """Collect every review page for a course, for star ratings 1 through 5.

    For each rating the page count is looked up once, then each page from 1
    up to that count is scraped. The per-page frames are concatenated into a
    single DataFrame with a fresh index.
    """
    page_frames = [
        scrape_reviews_by_rating_page(course_name, stars, page_no)
        for stars in range(1, 6)
        for page_no in range(1, pages_of_reviews(course_name, stars) + 1)
    ]
    return pd.concat(page_frames, ignore_index=True)

In [None]:
# testing = scrape_reviews('python-for-data-visualization')


In [None]:
# https://www.coursera.org/learn/python-for-applied-data-science-ai/reviews