In [73]:
import requests
from fake_useragent import UserAgent
import time
import json
import re
import sqlite3
from bs4 import BeautifulSoup
import datetime
import hashlib
# The decimal-extraction regex (r'\d+\.?\d*') used when parsing the average rating is adapted from
# https://stackoverflow.com/questions/12117024/decimal-number-regular-expression-where-digit-after-decimal-is-optional

In [2]:
# Schema for the "courses" table: one row per course, keyed by its URL.
# "title", "instructor", "average_rating" and "total_pages" are filled in by
# scrape_course_main_page; "scraped_pages" is advanced by scrape_course_review
# and read by scrape_reviews to decide which review page to fetch next.
courses_table="""CREATE TABLE if not exists "courses" (
	"url"	TEXT UNIQUE,
	"title"	TEXT,
	"instructor"	TEXT,
	"average_rating"	REAL,
	"total_pages"	INTEGER,
	"scraped_pages"	INTEGER,
	PRIMARY KEY("url")
);"""

# Schema for the "reviews" table: one row per scraped review, keyed by "id"
# (the MD5 hex digest that scrape_course_review computes for each review).
# "course_url" and "review" are declared TEXT: with INTEGER affinity SQLite
# would silently store an all-digit review such as "10" as the integer 10.
# NOTE: "if not exists" leaves an already-created table untouched, so a
# database built from an older declaration keeps its old column types.
reviews_table="""
CREATE TABLE if not exists "reviews" (
	"id"	TEXT UNIQUE,
	"course_url"	TEXT,
	"reviewer"	TEXT,
	"review_date"	TEXT,
	"review_unix"	REAL,
	"helpful_count"	INTEGER,
	"rating"	REAL,
	"review"	TEXT,
	PRIMARY KEY("id")
);"""

In [102]:
def initialize_db(db_path='d.db'):
    """Create the ``courses`` and ``reviews`` tables if they are missing.

    Both CREATE statements use ``if not exists``, so calling this repeatedly
    on the same database is harmless.

    Args:
        db_path: Path of the SQLite database file. Defaults to ``'d.db'``,
            the file the scraping functions in this notebook open.
    """
    conn=sqlite3.connect(db_path)
    try:
        conn.execute(courses_table)
        conn.execute(reviews_table)
        conn.commit()
    finally:
        # Release the database handle even when a CREATE statement fails.
        conn.close()

def scrape_course_main_page(course_url,cur,conn,debug=False,timeout=30):
    """Scrape one course's landing page and store its metadata in ``courses``.

    The title, instructor and average rating are read from ``course_url``;
    the number of review pages is read from ``course_url + '/reviews'``.
    The existing ``courses`` row whose ``url`` equals ``course_url`` is then
    updated and the change committed.

    Args:
        course_url: URL of the course page; also the key of the row to update.
        cur: Open sqlite3 cursor used for the UPDATE.
        conn: Connection owning ``cur``; committed after the update.
        debug: When true, print the scraped values.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot hang the scrape indefinitely.

    Raises:
        requests.HTTPError: If either page answers with an error status.
        AttributeError, IndexError: If the course page lacks one of the
            expected elements (instructor banner, rating, title).
    """
    response=requests.get(course_url,timeout=timeout)
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup=BeautifulSoup(response.content,'html.parser')
    instructor=soup.find('div',attrs={'class':'rc-BannerInstructorInfo'}).find('span').text.strip()
    rating_text=soup.find('span',attrs={'class':'number-rating'}).text.strip()
    # Keep only the leading decimal number of the rating label.
    average_rating=re.findall(r'\d+\.?\d*',rating_text)[0]
    title=soup.find('div',attrs={'class':'BannerTitle'}).find('h1').text.strip()

    review_response=requests.get(course_url+'/reviews',timeout=timeout)
    review_response.raise_for_status()
    review_soup=BeautifulSoup(review_response.content,'html.parser')
    pagination=review_soup.find('nav',attrs={'aria-label':'Pagination Controls'})
    page_labels=pagination.find_all('span') if pagination is not None else []
    # The second-to-last <span> of the pagination bar carries the last page
    # number. NOTE(review): falling back to 1 assumes a reviews page without
    # pagination controls has a single page -- confirm against the site.
    total_pages=int(page_labels[-2].text) if len(page_labels)>=2 else 1

    if debug:
        print("Course title: ",title)
        print("Instructor: ",instructor)
        print("Average course rating: ",average_rating)
        print("Total pages: ",total_pages)

    cur.execute("update courses set title=?, instructor=?, average_rating=?, total_pages=? where url=?", (title,instructor,average_rating,total_pages,course_url))
    conn.commit()
    
def scrape_course_review(course_url,pgn,cur,conn,debug=False,timeout=30):
    """Scrape one page of reviews for a course and store it in ``reviews``.

    Every review on ``course_url + '/reviews?page=<pgn>'`` is upserted into
    ``reviews``; afterwards ``courses.scraped_pages`` is set to ``pgn`` for
    this course and the page's changes are committed together.

    Args:
        course_url: URL of the course whose reviews are fetched.
        pgn: 1-based review page number. Pages 400 and above are skipped
            without any request being made.
        cur: Open sqlite3 cursor used for the writes.
        conn: Connection owning ``cur``; committed once per page.
        debug: When true, print each scraped review.
        timeout: Seconds passed to ``requests.get``.

    Raises:
        requests.HTTPError: If the page answers with an error status.
        AttributeError: If a review lacks one of the expected elements.
        ValueError: If a review date does not match ``'%b %d, %Y'``.
    """
    if pgn>=400:
        return#page 400 and after are not accessible
    review_url=course_url+'/reviews?page={}'.format(pgn)
    response=requests.get(review_url,timeout=timeout)
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup=BeautifulSoup(response.content,'html.parser')
    for review in soup.find_all('div',attrs={'class':'review-page-review'}):
        helpful_text=review.find('button',attrs={'class':'review-helpful-button'}).text
        helpful_match=re.search(r'\d+',helpful_text)
        # A button label without digits means nobody marked the review helpful.
        helpful_count=int(helpful_match.group()) if helpful_match else 0
        review_text=review.find('div',attrs={'class':'reviewText'}).text
        reviewer=review.find('p',attrs={'class':'reviewerName'}).text.strip()
        # Drop only the leading "By" label; replacing every "By" would also
        # mangle names that contain it (e.g. "Byron").
        if reviewer.startswith("By"):
            reviewer=reviewer[len("By"):].strip()
        review_date=review.find('div',attrs={'class':'dateOfReview'}).text.strip()
        # The parsed datetime is naive, so timestamp() interprets it in the
        # local timezone of the machine running the scrape.
        review_timestamp=datetime.datetime.strptime(review_date,'%b %d, %Y').timestamp()
        star_box=review.find('div',attrs={'role':'img'})
        star_count=len(star_box.find_all('title',string='Filled Star'))
        if debug:
            print("Reviwer: ",reviewer)
            print("Review Date: ",review_date)
            print("Upvote: ",helpful_count)
            print("Rating: ",star_count)
            print("Review: ",review_text)
        # NOTE: the id hashes review_url, which embeds the page number, so the
        # same review seen on a different page later gets a different id.
        uid=review_url+reviewer+review_date+review_text
        uid=hashlib.md5(uid.encode('utf-8')).hexdigest()
        cur.execute("INSERT INTO reviews (id,course_url,reviewer,review_date,review_unix,helpful_count,rating,review) values (?,?,?,?,?,?,?,?) on CONFLICT(id) DO UPDATE SET  id=id",(uid,course_url,reviewer,review_date,review_timestamp,helpful_count,star_count,review_text))
    cur.execute("update courses set scraped_pages=? where url=?",(pgn,course_url))
    # One commit per page: the reviews and the progress marker land together.
    conn.commit()
    
def scrape_course_list(pgn,cur,conn,debug=False,timeout=30):
    """Record the course URLs listed on one search-results page.

    The page's ``application/ld+json`` script is parsed and every entry of
    its ``itemListElement`` list is inserted into ``courses`` by URL; URLs
    already present are left as they are.

    Args:
        pgn: Page number of the search results to fetch.
        cur: Open sqlite3 cursor used for the inserts.
        conn: Connection owning ``cur``; committed once after the page.
        debug: When true, print each course URL.
        timeout: Seconds passed to ``requests.get``.

    Raises:
        requests.HTTPError: If the page answers with an error status.
        ValueError: If the page carries no JSON-LD script to parse.
    """
    list_url='https://www.coursera.org/search?query=free&page={}&index=prod_all_products_term_optimization'.format(pgn)
    response=requests.get(list_url,timeout=timeout)
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup=BeautifulSoup(response.content,'html.parser')
    ld_json=soup.find('script',attrs={'type':'application/ld+json'})
    if ld_json is None or not ld_json.string:
        # Fail with a message naming the page instead of an opaque
        # AttributeError on None.
        raise ValueError('no application/ld+json script found on {}'.format(list_url))
    list_obj=json.loads(ld_json.string)
    for course in list_obj['itemListElement']:
        cur.execute("INSERT INTO courses (url) Values (?) on CONFLICT(url) DO UPDATE SET  url=url",(course['url'],))
        if debug:
            print(course['url'])
    conn.commit()

In [103]:
def scrape_course_meta_data(scrape_course_page=True,scrape_course_list=True,pages_of_courses=10):
    """Populate the ``courses`` table in ``d.db``.

    Args:
        scrape_course_page: When true, scrape the landing page of every
            course whose ``title`` is still NULL.
        scrape_course_list: When true, first collect course URLs from search
            result pages ``1..pages_of_courses``.
        pages_of_courses: Number of search-result pages to walk.
    """
    initialize_db()
    conn=sqlite3.connect('d.db')
    try:
        cur=conn.cursor()
        if scrape_course_list:
            # The boolean parameter shadows the module-level function of the
            # same name, so calling it directly would invoke a bool. Fetch
            # the real function from the module namespace instead.
            list_scraper=globals()['scrape_course_list']
            for page in range(1,pages_of_courses+1):
                list_scraper(page,cur,conn)
                time.sleep(5)  # pause between requests

        if scrape_course_page:
            pending=cur.execute("select url from courses where title is null").fetchall()
            for (course_url,) in pending:
                scrape_course_main_page(course_url,cur,conn)
                time.sleep(5)  # pause between requests

        cur.close()
    finally:
        # Release the database handle even when a scrape step raises.
        conn.close()

In [110]:
def scrape_reviews(max_pages_per_course=10):
    """Fetch the outstanding review pages of every course in ``d.db``.

    For each course with a known ``total_pages``, scraping resumes at the
    page after ``scraped_pages`` (or at page 1 when nothing was scraped yet)
    and stops at ``total_pages`` or ``max_pages_per_course``, whichever is
    smaller.

    Args:
        max_pages_per_course: Upper bound on the page number scraped per
            course; only the first few pages of each course are taken to
            have some diversity across courses. Defaults to 10.
    """
    conn=sqlite3.connect('d.db')
    try:
        cur=conn.cursor()
        rows=cur.execute("select url,total_pages,scraped_pages from courses where total_pages is not null").fetchall()
        for course_url,total_pages,scraped_pages in rows:
            if scraped_pages is None:
                scraped_pages=0
            # Bounding the range up front avoids looping over pages that
            # would only be skipped; a fully scraped course yields an empty
            # range.
            last_page=min(total_pages,max_pages_per_course)
            for pgn in range(scraped_pages+1,last_page+1):
                scrape_course_review(course_url,pgn,cur,conn)
                time.sleep(5)  # pause between requests
        cur.close()
    finally:
        # Release the database handle even when a scrape step raises.
        conn.close()

In [None]:
# Run the review scrape for every course in d.db that has a known page count.
# NOTE(review): this relies on the courses table already being populated;
# no call to scrape_course_meta_data() is visible in these cells -- confirm
# it is run beforehand.
scrape_reviews()