## 1.3 Course review scraper  
This notebook presents the code used to scrape the reviews of a Coursera course, given the course's href link (e.g. `/learn/machine-learning`).

### Import libraries

In [2]:
# webscraping libraries
import requests
from bs4 import BeautifulSoup

# other libraries 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Allow up to 350 characters per column when pandas displays a DataFrame,
# so long text cells are not cut off at the default width.
pd.options.display.max_colwidth = 350

### Scrape Reviews

In [3]:
# sample URL: https://www.coursera.org/learn/data-analysis-with-python/reviews?star=1

def pages_of_reviews(course_href='/learn/excel-essentials', rating=2, timeout=30):
    """Return how many review pages a course has for one star rating.

    Parameters
    ----------
    course_href : str
        Course path on coursera.org, with or without a leading slash
        (e.g. '/learn/excel-essentials').
    rating : int
        Star rating used for the ``star`` query parameter.
    timeout : float
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    int
        The largest page number shown in the pagination widget, or 1 when
        the page has no pagination widget or no numeric page button.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    # Drop the leading slash so the URL does not contain '//' after the host.
    url = f"https://www.coursera.org/{course_href.lstrip('/')}/reviews?star={rating}"
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(res.content, 'html.parser')

    pagination = soup.find('ul', {'class': '_l6n5rs cui-buttonList'})
    if pagination is None:
        # No pagination widget found: treat the reviews as a single page.
        return 1

    buttons = pagination.find_all('li', {'style': 'display:inline-block'})
    labels = (button.get_text(strip=True) for button in buttons)
    # Only numeric labels are page numbers; other buttons (e.g. arrows) are skipped.
    page_numbers = [int(label) for label in labels if label.isdecimal()]
    return max(page_numbers, default=1)

In [4]:
# sample URL: https://www.coursera.org/learn/data-analysis-with-python/reviews?star=1&page=2

def scrape_reviews_by_rating_page(course_href, rating, review_page, timeout=30):
    """Scrape one page of reviews for a course and star rating.

    Parameters
    ----------
    course_href : str
        Course path on coursera.org, with or without a leading slash
        (e.g. '/learn/data-analysis-with-python').
    rating : int
        Star rating used for the ``star`` query parameter.
    review_page : int
        Page number used for the ``page`` query parameter.
    timeout : float
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per review with the columns 'review', 'date_of_review',
        'rating' and 'course_href'. 'course_href' holds the value exactly
        as it was passed in.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the page yields a different number of review texts and dates,
        since the two lists could then not be paired row by row.
    """
    # Drop the leading slash so the URL does not contain '//' after the host.
    url = (f"https://www.coursera.org/{course_href.lstrip('/')}"
           f"/reviews?star={rating}&page={review_page}")
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(res.content, 'html.parser')

    list_of_reviews = [
        review.text
        for review in soup.find_all('div', {'class': 'reviewText'})
    ]
    list_of_dates = [
        date.text
        for date in soup.find_all(
            'div', {'class': 'dateOfReview p-x-1s m-b-0 text-secondary font-xs'})
    ]
    if len(list_of_reviews) != len(list_of_dates):
        # Fail with a message that names the page instead of pandas'
        # generic length-mismatch error.
        raise ValueError(
            f'Found {len(list_of_reviews)} review texts but '
            f'{len(list_of_dates)} review dates at {url}')

    dict_reviews = {
        'review': list_of_reviews,
        'date_of_review': list_of_dates,
        'rating': rating,            # scalar, repeated on every row
        'course_href': course_href}  # scalar, repeated on every row
    return pd.DataFrame(dict_reviews)

In [5]:
# sample URL: https://www.coursera.org/learn/data-analysis-with-python

def scrape_reviews(course_href):
    """Collect every review of a course into a single DataFrame.

    For each star rating from 1 to 5, the number of review pages is looked
    up with ``pages_of_reviews`` and each page is then scraped with
    ``scrape_reviews_by_rating_page``. The per-page frames are concatenated
    in that order under a fresh 0..n-1 index.
    """
    page_frames = [
        scrape_reviews_by_rating_page(course_href, star, page_number)
        for star in range(1, 6)
        # The page count is looked up once per star rating, before its pages are scraped.
        for page_number in range(1, pages_of_reviews(course_href, star) + 1)
    ]
    return pd.concat(page_frames, ignore_index=True)

In [6]:
# Scrape all reviews (star ratings 1-5, every page) for the course at
# '/learn/machine-learning'. Each page is fetched with its own HTTP request.
reviews_machine_learning = scrape_reviews('/learn/machine-learning')

In [8]:
# uncomment to run code
# reviews_machine_learning.to_csv('./data/reviews_machine_learning.csv') 

In [None]:
# end of notebook