# 1.1 Coursera Course List Scraper
This notebook contains the code used to scrape the list of all data science courses on Coursera.

## Import libraries

In [None]:
# webscraping libraries
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time 

# other libraries 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# show up to 350 characters per cell so long values are not truncated when displayed
pd.set_option("display.max_colwidth", 350)

In [None]:
# reference: https://towardsdatascience.com/web-scraping-using-selenium-python-8a60f4cf40ab
# reference: https://nbviewer.jupyter.org/github/thefirebanks/CourseraScraper/blob/master/coursera_url_scrapper.ipynb

In [None]:
# Start a Chrome browser session controlled by Selenium, using the chromedriver
# binary at this relative path.
# NOTE(review): passing the driver path as the first positional argument looks like
# the Selenium 3 calling convention - confirm it matches the installed Selenium version.
driver = webdriver.Chrome('../capstone_others/chromedriver')

In [None]:
# Open the Coursera data science browse page. The `facets` query parameter,
# URL-decoded, reads: entityTypeTag:Courses,categoryMultiTag:data-science
driver.get(
    'https://www.coursera.org/browse/data-science'
    '?facets=entityTypeTag%3ACourses%2CcategoryMultiTag%3Adata-science'
)

In [None]:
# Some of the information in the html only shows up while scrolling through the page,
# so the scraping is done step by step: scroll, scrape, scroll again.

# first, move the scroll position to the beginning of the page
driver.execute_script("window.scrollTo(0, 0)")

# vertical distance (in pixels) covered by each scroll
step = 2300

# use the height of the page to determine how many scrolls are needed.
# The scraping loop's first scroll goes to offset 0, so ceil(height / step) + 1
# passes are required for the last scroll to reach the bottom of the page;
# plain floor division would stop up to two steps short of it.
last_height = driver.execute_script("return document.body.scrollHeight")
scrolls = (last_height + step - 1) // step + 1

# initial parameters for the scroll-and-scrape loop
top = 0            # previous vertical offset
bottom = 0         # vertical offset of the next scroll
links_lists = []   # one list of course urls per scroll

In [None]:
# Scroll through the page one step at a time and, after every scroll, collect
# the course links found in the results section.
# Each pass appends its list of urls to links_lists.

for i in range(scrolls):
    # print(i, top, bottom)  # uncomment to check that the loop is progressing

    # window.scrollTo takes (x, y): keep the horizontal offset at 0 and only
    # move the vertical offset, then wait so the page can update the results
    driver.execute_script(f"window.scrollTo(0, {bottom})")
    time.sleep(2)

    # get every anchor inside the results section
    # NOTE(review): find_elements_by_xpath was removed in Selenium 4.3; on newer
    # versions use driver.find_elements("xpath", ...) instead.
    results = driver.find_elements_by_xpath('//*[@id="rendered-content"]/div/div/div[1]/section/div[3]/section//a')

    # read each href once; anchors without an href yield None and are skipped
    hrefs = [result.get_attribute("href") for result in results]
    links = [href for href in hrefs if href and "learn/" in href]
    links_lists.append(links)

    # update the parameters
    top = bottom
    bottom = bottom + step

In [None]:
# unpack the per-scroll lists of urls into one flat list
new_list = [link for links in links_lists for link in links]

# remove duplicate urls; sorting gives the rows a stable order, whereas the
# iteration order of a set of strings can change from one run to the next
final_list = sorted(set(new_list))

# convert the list into a single-column DataFrame
course_urls = pd.DataFrame(final_list)

In [None]:
# Export the course_urls DataFrame as a csv file.
# The export line is left commented out; remove the leading '#' to run it.
# course_urls.to_csv('../data/ds_course_urls.csv') # uncomment to save a new csv file

In [None]:
# end of the notebook: close the browser window and stop the chromedriver process
driver.quit()