## 1.2 Coursera Course Information Scraper
This notebook presents the code used to scrape the basic course information from Coursera, given each course's href link.

### Import libraries

In [None]:
# webscraping libraries
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time

# other libraries 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Show up to 350 characters per cell so long text values (e.g. URLs) are not truncated when displayed.
pd.set_option('display.max_colwidth', 350)

### Import the urls generated from the previous step

In [None]:
# Load the course URLs into a single 'url' column and discard rows with a missing value.
# NOTE(review): an earlier revision dropped an 'Unnamed: 0' column at this point; that step is
# disabled now that names=['url'] is passed -- confirm against the CSV layout if 'url' looks wrong.
course_url = pd.read_csv(
    './data/ds_course_urls.csv',
    header=0,
    names=['url'],
).dropna()

course_url.head()

In [None]:
# Derive the site-relative path of every course by removing the 'https://www.coursera.org' domain.
def _strip_domain(full_url):
    """Return `full_url` with every occurrence of the Coursera domain removed."""
    return full_url.replace('https://www.coursera.org', '')


course_url['course_href'] = course_url['url'].map(_strip_domain)

In [None]:
# Preview the first rows again, now including the derived 'course_href' column.
course_url.head()

### Scrape course info with BeautifulSoup

In [None]:
# The below function scrapes the course information with the given url.

# course_herf = '/learn/astro'
# https://www.coursera.org/learn/astro

def course_info(course_herf, timeout=30):
    """Scrape the basic information of one Coursera course page.

    Parameters
    ----------
    course_herf : str
        Course path relative to the site root, e.g. '/learn/astro'.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict
        'course_href' holds the input path unchanged. 'outcome' and 'highlight' are
        de-duplicated lists of strings (order not guaranteed). 'course_name',
        'partner_title', 'stars', 'recent_views', 'num_ratings', 'num_reviews',
        'description' and 'length' hold the raw text of the matched page element.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an HTTP error status.
    AttributeError, IndexError
        If one of the expected elements is missing from the page.
    """
    url = f'https://www.coursera.org{course_herf}'
    # A timeout keeps a single unresponsive request from stalling a whole scraping run.
    res = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx responses instead of trying to parse an error page.
    res.raise_for_status()
    # Name the parser explicitly so the result does not depend on which parsers are installed.
    soup = BeautifulSoup(res.content, 'html.parser')

    # Sets remove repeated entries; the lists built from them have no guaranteed order.
    outcomes = {each.text for each in soup.find_all('div', {'class': '_1k3yl1y'})}
    highlights = {each.text for each in soup.find_all('div', {'class': '_16ni8zai m-b-0'})}

    dict_course_info = {
        'course_href': course_herf,
        'course_name': soup.find('h1').text,
        'partner_title': soup.find('h4').text,
        'stars': soup.find('div', {'class': 'rc-ReviewsOverview__totals__rating'}).text,
        'recent_views': soup.find('div', {'class': 'rc-ProductMetrics'}).text,
        # The rating count is the text of the fourth <div> nested inside the rating widget.
        'num_ratings': soup.find('div', {'class': '_1srkxe1s XDPRating'}).find_all('div')[3].text,
        'num_reviews': soup.find('span', {'itemprop': 'reviewCount'}).text,
        'description': soup.find('div', {'class': 'm-t-1 description'}).text,
        'outcome': list(outcomes),
        'highlight': list(highlights),
        # The length is read from the second-to-last '_1tu07i3a' block on the page.
        'length': soup.find_all('div', {'class': '_1tu07i3a'})[-2].find('span').text}

    return dict_course_info

### Scrape course information of each of the courses in the `course_url` DataFrame

In [None]:
# Plain Python list of the course paths to scrape.
href_list = course_url['course_href'].tolist()

In [None]:
# Scrape every course in `href_list`. Courses that cannot be scraped are skipped and
# recorded in `failed_hrefs` together with the error that caused the skip.

# The last line of this cell rebinds `course_info` to a DataFrame, so running the cell a
# second time would fail for every course. Stop early with a clear message instead.
if not callable(course_info):
    raise RuntimeError(
        'course_info is no longer a function; re-run the cell that defines course_info() first.')

list_info = []
failed_hrefs = []

for each in href_list:
    try:
        list_info.append(course_info(each))
    except Exception as err:
        # Best-effort scraping: keep going, but remember what failed and why.
        # Catching Exception (not a bare except) still lets KeyboardInterrupt stop a long run.
        failed_hrefs.append((each, repr(err)))

print(f'scraped {len(list_info)} of {len(href_list)} courses; {len(failed_hrefs)} failed')

# NOTE: this rebinds `course_info` from the scraper function to the results DataFrame; the
# name is kept because the commented-out save cell below refers to `course_info`.
course_info = pd.DataFrame(list_info)

In [None]:
# uncomment to run
# course_info.shape
# course_info.to_csv('./data/ds_course_df.csv')

### Scrape Course Info with Selenium

In [None]:
# Start the Chrome session used by the Selenium cells, passing the chromedriver path.
CHROMEDRIVER_PATH = '../capstone_others/chromedriver'
driver = webdriver.Chrome(CHROMEDRIVER_PATH)

In [None]:
# Open one course page in the browser session so the enrollment XPath can be tried on it.
driver.get('https://www.coursera.org/learn/exploratory-data-analysis')

In [None]:
# Read the enrollment text from the loaded page (the same XPath that course_info_sel uses).
# find_elements(by, value) is used because find_elements_by_xpath was removed in Selenium 4.3.
driver.find_elements('xpath', '//*[@id="main"]/div/div[1]/div[1]/div[1]/div/div/div[2]/div[2]/div/div[2]/div/div/span/strong/span')[0].text

In [None]:
def course_info_sel(course_herf):
    """Read the enrollment text from the page currently loaded in the global `driver`.

    This function does not navigate: the caller must already have opened the page that
    belongs to `course_herf` (e.g. with `driver.get(...)`). `course_herf` is only copied
    into the result.

    Parameters
    ----------
    course_herf : str
        Course path relative to the site root, e.g. '/learn/exploratory-data-analysis'.

    Returns
    -------
    dict
        {'course_href': <the input path>, 'enrollment': <text of the first element matched>}.

    Raises
    ------
    IndexError
        If the XPath matches no element on the current page.
    """
    enrollment_xpath = '//*[@id="main"]/div/div[1]/div[1]/div[1]/div/div/div[2]/div[2]/div/div[2]/div/div/span/strong/span'
    # find_elements(by, value) is used because find_elements_by_xpath was removed in Selenium 4.3.
    matches = driver.find_elements('xpath', enrollment_xpath)
    course_info_dict = {
        'course_href': course_herf,
        'enrollment': matches[0].text}
    return course_info_dict

In [None]:
# Try the helper on the page opened above; it reads whichever page `driver` currently shows.
course_info_sel('/learn/exploratory-data-analysis')

In [None]:
# Visit every course page with Selenium and collect its enrollment text. Pages that
# cannot be scraped are skipped and recorded in `failed_enrollment_hrefs`.
list_enrollment = []
failed_enrollment_hrefs = []

for each in href_list:
    try:
        # Navigate first: course_info_sel reads from whichever page the driver currently shows.
        driver.get(f'https://www.coursera.org{each}')
        enrollment = course_info_sel(each)
        list_enrollment.append(enrollment)
    except Exception as err:
        # Best-effort scraping: keep going, but remember what failed and why.
        # Catching Exception (not a bare except) still lets KeyboardInterrupt stop a long run.
        failed_enrollment_hrefs.append((each, repr(err)))

print(f'scraped {len(list_enrollment)} of {len(href_list)} pages; {len(failed_enrollment_hrefs)} failed')

enrollment_info = pd.DataFrame(list_enrollment)

In [1]:
# uncomment to run
# enrollment_info.shape
# enrollment_info.to_csv('./data/ds_enrollment.csv')

In [2]:
# end of notebook