In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from tqdm.notebook import tqdm
from time import sleep
import random
import re

In [4]:
# generate a list of user agents from stored csv
def user_agent_generator(path='user_agents.csv'):
    """Load the pool of User-Agent strings from a CSV file.

    Parameters
    ----------
    path : str or path-like, default 'user_agents.csv'
        CSV file that contains a ``user_agent`` column.

    Returns
    -------
    pandas.Series
        The ``user_agent`` column with missing entries removed and a fresh
        0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    KeyError
        If the file has no ``user_agent`` column.
    """
    agent_list = pd.read_csv(path)['user_agent']
    # Callers pick an entry with random.choice(), which indexes by integer
    # position 0..len-1; drop blanks and renumber so every pick is a real string.
    return agent_list.dropna().reset_index(drop=True)

# test user agent generator
# The result is stored in the module-level name `user_agents`, which make_soup
# and course_url_bs read when choosing a random User-Agent header.
user_agents = user_agent_generator()
# Preview the first five entries.
user_agents[:5]

0    Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0)...
1    Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...
2    Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)...
3    Mozilla/5.0 (Windows NT 6.1; Win64; x64) Apple...
4    Mozilla/5.0 (Windows NT 6.3; Win64; x64) Apple...
Name: user_agent, dtype: object

In [7]:
# Build a BeautifulSoup tree for one page of Coursera's all-course listing

def make_soup(page):
    """Download listing page ``page`` and return it parsed with 'html.parser'.

    Each call sends a User-Agent drawn at random from the module-level
    ``user_agents`` and pauses for 5 seconds after the request completes.
    """
    headers = {'User-Agent': random.choice(user_agents)}
    target = f"https://www.coursera.org/courses?page={page}&index=prod_all_products_term_optimization"

    reply = requests.get(target, headers=headers, timeout=10)
    # Fixed pause after every request, before the response is parsed.
    sleep(5)
    return BeautifulSoup(reply.content, 'html.parser')

In [19]:
# test with search results page 1
soup = make_soup(1)
# print(soup.prettify())
# Print only the visible text of the page to check whether course results are present.
print(soup.get_text())

Courses | CourseraExploreFor EnterpriseFor StudentsLog InJoin for FreeBrowse>CourseraCoursera CoursesAllGuided ProjectsDegrees & CertificatesShowing 8413 total resultsOther topics to exploreArts and Humanities338 coursesBusiness1095 coursesComputer Science668 coursesData Science425 coursesInformation Technology145 coursesHealth471 coursesMath and Logic70 coursesPersonal Development137 coursesPhysical Science and Engineering413 coursesSocial Sciences401 coursesLanguage Learning150 coursesWhat Coursera Has to Offerlearning programdescriptionGUIDED PROJECTLearn a job-relevant skill that you can use today in under 2 hours through an interactive experience guided by a subject matter expert. Access everything you need right in your browser and complete your project confidently with step-by-step instructions.COURSETake courses from the world's best instructors and universities. Courses include recorded auto-graded and peer-reviewed assignments, video lectures, and community discussion forums.

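#### The search results do not appear in the extracted text
- The table of results looks like dynamic content: it is probably rendered in the browser by JavaScript, so it is missing from the static HTML that `requests` downloads.
- Also, moving to other result pages in the browser doesn't change the URL query.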

In [20]:
# Functions to extract course results data 

# Module-level accumulator: every call to extract_course appends to this list.
course_list = []


def _text_or_nan(card, tag, css_class):
    """Return the stripped text of the first matching descendant of ``card``, or NaN."""
    node = card.find(tag, class_=css_class)
    if node is None:
        return np.nan
    return node.text.strip()


def extract_course(soup):
    """Append one record per search-result card in ``soup`` to ``course_list``.

    Parameters
    ----------
    soup : BeautifulSoup (or any object with a compatible ``find_all``)
        Parsed search-results page.

    Returns
    -------
    list of dict
        The module-level ``course_list`` (the same object on every call), with
        the records from this page appended. Fields missing from a card are NaN.
    """
    for item in soup.find_all('a', class_='rc-MobileSearchCard'):
        course = {
            # Tag name fixed: the original searched for 'spam' instead of 'span'.
            'partner': _text_or_nan(item, 'span', 'partner-name'),
            'title': _text_or_nan(item, 'h2', 'color-primary-text card-title headline-1-text'),
            'learning_product': _text_or_nan(item, 'div', '_jen3vs _1d8rgfy3'),
            'rating': _text_or_nan(item, 'span', 'ratings-text'),
            'rating_count': _text_or_nan(item, 'span', 'ratings-count'),
            'students': _text_or_nan(item, 'span', 'enrollment-number'),
            # Nothing in this function extracts a description (the original
            # referenced an undefined name here); keep the key as a NaN placeholder.
            'full_description': np.nan,
            'difficulty': _text_or_nan(item, 'span', 'difficulty'),
            # The card itself is the <a> element, so the link is its own href.
            'link': item.get('href', np.nan),
        }
        course_list.append(course)

    return course_list

In [21]:
# test on search result page 1
# `soup` is the page-1 tree created by make_soup(1) in an earlier cell.
extract_course(soup)

[]

### Try Selenium

In [14]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

In [34]:
# Load the course search page in a real browser and read every result card.
# NOTE: the chromedriver location below is machine-specific.
driver = webdriver.Chrome(executable_path='/Users/nancybui/Downloads/chromedriver')

url = 'https://www.coursera.org/courses'

driver.get(url)

# Wait up to 10 s for the result cards to be present. An explicit wait is used
# instead of implicitly_wait so that a field missing from a card does not make
# each lookup below block for the full timeout.
try:
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, 'rc-MobileSearchCard'))
    )
except TimeoutException:
    # No card appeared in time; continue with an empty result list.
    pass

# extract data
courses = driver.find_elements(By.CLASS_NAME, 'rc-MobileSearchCard')


def _card_text(card, css_selector):
    """Return the stripped text of the first descendant of ``card`` that matches
    ``css_selector``, or NaN when nothing matches."""
    matches = card.find_elements(By.CSS_SELECTOR, css_selector)
    return matches[0].text.strip() if matches else np.nan


course_list = []

# Fields are located relative to each card by the same tag/class names that
# extract_course uses (assumed to still match the live page -- TODO confirm).
# The original XPaths were page-absolute and pinned to the first list item,
# so they could not match inside an individual card.
for card in courses:
    href = card.get_attribute('href')
    course = {
        'partner': _card_text(card, 'span.partner-name'),
        'title': _card_text(card, 'h2.card-title'),
        'learning_product': _card_text(card, 'div._jen3vs._1d8rgfy3'),
        'rating': _card_text(card, 'span.ratings-text'),
        'rating_count': _card_text(card, 'span.ratings-count'),
        'students': _card_text(card, 'span.enrollment-number'),
        # No description is extracted here (the original referenced an
        # undefined name); keep the key as a NaN placeholder.
        'full_description': np.nan,
        'difficulty': _card_text(card, 'span.difficulty'),
        # The card element is located by the same class extract_course uses
        # for the <a> card, so its own href is the course link.
        'link': href if href else np.nan,
    }
    course_list.append(course)

# driver.find_element_by_xpath('//*[@id="login"]/button').click()

# driver.quit()
course_list

[]

### Different approach

In [49]:
# Scraping all urls of individual courses - Selenium

def course_url_selenium(page):
    """Collect course names and links from one page of the course directory.

    Parameters
    ----------
    page : int
        Directory page number, inserted into the ``page`` query parameter.

    Returns
    -------
    (list of str, list of str)
        Link texts and their ``href`` values, in page order.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the directory container does not become clickable/visible in time.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("start-maximized")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    url = f'https://www.coursera.org/directory/courses?page={page}'
    container = (By.XPATH, '//*[@id="rendered-content"]/div/div/div/div/div[2]/div[2]')

    driver = webdriver.Chrome(options=options, executable_path='/Users/nancybui/Downloads/chromedriver')
    # try/finally so the browser is always closed; the original called
    # driver.quit() after `return`, where it could never run.
    try:
        driver.get(url)

        WebDriverWait(driver, 10).until(EC.element_to_be_clickable(container)).click()
        WebDriverWait(driver, 5).until(EC.visibility_of_all_elements_located(container))

        courses = []
        hrefs = []
        # A class-name locator accepts a single class only, so the compound
        # class list is expressed as a CSS selector on the anchor elements.
        links = driver.find_elements(By.CSS_SELECTOR, 'a.cds-1.cds-135.cds-137.css-19sqvu6.cds-24')
        for item in links:
            # Each matched element is the course link itself: read its own
            # text and href (attribute name passed as a string).
            courses.append(item.text)
            hrefs.append(item.get_attribute('href'))
        return courses, hrefs
    finally:
        driver.quit()

In [50]:
# Smoke-test the Selenium URL scraper on directory page 1.
course_url_selenium(1)

([], [])

In [41]:
# Scraping all urls of individual courses - BS4

def course_url_bs(page):
    """Collect course names and links from the static HTML of a directory page.

    Parameters
    ----------
    page : int
        Directory page number, inserted into the ``page`` query parameter.

    Returns
    -------
    (list of str, list of str)
        Stripped link texts and their ``href`` values, in document order.
    """
    url = f'https://www.coursera.org/directory/courses?page={page}'
    # User-Agent drawn at random from the module-level `user_agents`.
    user_agent = random.choice(user_agents)
    response = requests.get(url,
                            headers={'User-Agent': user_agent},
                            timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')

    courses = []
    hrefs = []
    # href=True restricts the match to anchors that actually carry a link.
    anchors = soup.find_all('a', class_='cds-1 cds-135 cds-137 css-19sqvu6 cds-24', href=True)
    for item in anchors:
        # `item` is already the <a> element; the original searched for another
        # <a> inside it, which returns None and raised AttributeError.
        courses.append(item.text.strip())
        hrefs.append(item['href'])
    return courses, hrefs

In [42]:
# Smoke-test the requests/BeautifulSoup URL scraper on directory page 1.
course_url_bs(1)

([], [])

In [55]:
import urllib.request
import requests
import bs4
import os
import time
import warnings

import pandas as pd
import numpy as np

from multiprocessing import Pool
from bs4 import BeautifulSoup
from selenium import webdriver

def scraper(page):
    """Open one course-directory page in Chrome and extract its course links.

    Parameters
    ----------
    page : int
        Directory page number, inserted into the ``page`` query parameter.

    Returns
    -------
    dict
        ``{'Name': [...], 'Link': [...]}`` with the text and ``href`` of every
        matching anchor, in document order.
    """
    url = f'https://www.coursera.org/directory/courses?page={page}'

    driver = webdriver.Chrome(executable_path='/Users/nancybui/Downloads/chromedriver')
    # try/finally so the browser is closed even when loading the page fails.
    try:
        driver.get(url)
        # Snapshot of the DOM as it is right after get() returns; there is no
        # additional wait for late-rendered content.
        res = driver.execute_script("return document.documentElement.outerHTML")
    finally:
        driver.quit()

    soup = BeautifulSoup(res, 'lxml')
    c_link = soup.find_all('a', {'class': 'cds-1 cds-135 cds-137 css-19sqvu6 cds-24'}, href=True)

    lst_name = [c.get_text() for c in c_link]
    lst_link = [c['href'] for c in c_link]

    return {'Name': lst_name, 'Link': lst_link}

# warnings.simplefilter('ignore')
# if __name__ == '__main__':
#     start = time.time()

#     choke = np.arange(1, 115, 10)[:-1]
#     lst_name = []
#     lst_link = []
#     data_main = {'Name': lst_name, 'Link':lst_link}
#     for i in choke:
        
#         p = Pool(10)
#         data = p.map(scraper, range(i, i+10))
#         p.terminate()
#         p.join()
        
#         lst_name = []
#         lst_link = []
#         _data = {'Name': lst_name, 'Link':lst_link}
#         for d in data:
#             _data['Name'] += d['Name']
#             _data['Link'] += d['Link']
        
#         data_main['Name'] += _data['Name']
#         data_main['Link'] += _data['Link']

#     print(data_main)
#     end = time.time()
#     print(str(len(data_main['Name'])) + ' items in ' + str(end-start) + ' seconds')

#     pd.DataFrame(data=data_main).to_csv('coursera-course-data.csv')

In [56]:
# Smoke-test the browser-rendered scraper on directory page 1.
scraper(1)

{'Name': [], 'Link': []}