In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
from typing import List, Dict
import random,time

In [2]:
# Collect course links from Coursera's search-result pages, stopping as soon
# as MAX_LINKS links have been gathered.
MAX_LINKS = 200

n = 0            # number of links collected so far
link_list = []   # href of the first anchor found in each course card

for i in range(1, 20):
    # Must be an f-string: without the prefix the literal text "{i}" is sent
    # as the page number and every iteration requests the same URL.
    url = f"https://www.coursera.org/courses?query=free&page={i}"
    response = requests.get(url)
    page_html = response.content
    page_content = BeautifulSoup(page_html, "html.parser")

    course_container = page_content.find_all(
        "div",
        attrs={"class": "cds-ProductCard-base cds-ProductCard-grid css-1gwppjr"},
    )

    for course in course_container:
        anchor = course.find("a")
        # Skip cards that carry no usable link instead of raising.
        if anchor is None or not anchor.get("href"):
            continue
        link_list.append(anchor["href"])

        n += 1
        if n >= MAX_LINKS:
            break

    # Propagate the inner break so no further pages are requested.
    if n >= MAX_LINKS:
        break

In [3]:
def get_page_content(link, timeout=30):
    """Download a page and return it parsed as a BeautifulSoup tree.

    Args:
        link: URL to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A BeautifulSoup object built from the response body with the
        built-in 'html.parser'.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(link, timeout=timeout)
    # Previously a non-200 status left `content` unassigned and the function
    # died with UnboundLocalError; raise a descriptive HTTP error instead.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

def get_review_info(link,course) -> List[Dict]:
    """Scrape the reviews shown on a course's review page.

    Args:
        link: URL of the review page to fetch.
        course: Value stored under 'Course Name' in every returned record.

    Returns:
        A list with one dict per review element found on the page. Each dict
        has the keys 'Course Name', 'Reviewer Name', 'Date of Review',
        'Review Content' and 'Rating Star'. A field whose HTML element is
        missing is left as None rather than aborting the whole scrape.
    """
    content = get_page_content(link)

    # Random 1-3 second pause after each review-page fetch.
    time.sleep(random.uniform(1, 3))

    def _text(node):
        """Return node.text, or None when the lookup found nothing."""
        return node.text if node is not None else None

    all_review_course = []
    review_container = content.find_all('div',attrs={'class':'cds-9 review review-text review-page-review m-b-2 css-0 cds-10'})
    for review in review_container:
        # Every field starts as None so a partially rendered review still
        # yields a record with a stable set of keys.
        review_info = {
            'Course Name': course,
            'Reviewer Name': None,
            'Date of Review': None,
            'Review Content': None,
            'Rating Star': None
        }

        # This element holds the star icons, the reviewer name and the date.
        star_container = review.find('div',attrs = {'class':'cds-9 css-1cyk8pe cds-11 cds-grid-item cds-18'})
        if star_container is not None:
            # The rating is the number of svg icons with this exact inline
            # style (presumably the filled stars - verify against the page).
            star = star_container.find_all('svg',attrs = {'style':"fill:#F2D049;height:14px;width:14px;margin-right:2px;vertical-align:text-bottom"})
            review_info['Rating Star'] = len(star)

            reviewer_con = star_container.find('p',attrs = {'class':'reviewerName p-x-1s css-vac8rf'})
            if reviewer_con is not None:
                review_info['Reviewer Name'] = _text(reviewer_con.find('span'))

            review_info['Date of Review'] = _text(
                star_container.find('p',attrs = {'class':'dateOfReview p-x-1s css-vac8rf'})
            )

        review_text = review.find('div',attrs = {'class':'reviewText'})
        if review_text is not None:
            review_info['Review Content'] = _text(review_text.find('span'))

        all_review_course.append(review_info)
    return all_review_course

def get_course_info(link_list):
    """Scrape course details and reviews for every link in link_list.

    Args:
        link_list: Iterable of course hrefs. Each one is resolved against
            the Coursera base URL with urljoin.

    Returns:
        A tuple (df1, df2) of pandas DataFrames:
        df1 has one row per course with the columns 'Course ID',
        'Course Name', 'Unit', 'Review Number', 'Avg Star', 'Level' and
        'Results'; df2 has one row per review as returned by
        get_review_info. Both are empty when link_list is empty.

    Raises:
        AttributeError, IndexError: If a course page lacks one of the
            elements the selectors below expect.
    """
    # Same host as the search pages the links come from (was misspelled
    # with four w's).
    base_url = 'https://www.coursera.org'

    all_courses = []
    all_reviews = []

    for link in link_list:
        course_link = urljoin(base_url, link)
        content = get_page_content(course_link)

        # Course title, plus the alt text of an image on the page
        # (presumably the offering organisation's logo - verify).
        course_name = content.find('h1').text
        img = content.find('img', attrs={'class': 'css-1f9gt0j'})
        unit = img.get('alt')

        # Average star rating and review count from the key-information box.
        course_info = content.find('div', attrs={'data-e2e': 'key-information'})
        star = course_info.find('div', attrs={'class': 'cds-119 cds-Typography-base css-h1jogs cds-121'}).text
        # First whitespace-separated token of the first matching <p>, with a
        # leading '(' stripped.
        review = course_info.find_all('p', attrs={'class': 'css-vac8rf'})[0].text.split()[0].lstrip('(')
        level = content.find('div', attrs={'class': 'css-fk6qfz'}).text

        # Text of the first <span> in each matching list item.
        results_container = content.find_all('li', attrs={'class': 'cds-9 css-0 cds-11 cds-grid-item cds-56 cds-64'})
        results = [result.find('span').text for result in results_container]

        all_courses.append({
            # NOTE(review): 'Course ID' duplicates the course name; confirm
            # whether a distinct identifier was intended.
            'Course ID': course_name,
            'Course Name': course_name,
            'Unit': unit,
            'Review Number': review,
            'Avg Star': star,
            'Level': level,
            'Results': '\n'.join(results),
        })

        # The course's reviews live under '<course url>/reviews'.
        review_link = course_link + '/reviews'
        all_reviews.extend(get_review_info(review_link, course_name))

    # Built after the loop; the former `for ... else` ran unconditionally
    # because the loop never breaks.
    df1 = pd.DataFrame(all_courses)
    df2 = pd.DataFrame(all_reviews)

    return df1, df2



In [4]:
# Batch 1: the first 50 links -> (courses frame, reviews frame).
part1 = link_list[:50]
part1_df1, part1_df2 = get_course_info(part1)

In [5]:
# Batch 2: links 50-99 -> (courses frame, reviews frame).
part2 = link_list[50:100]
part2_df1, part2_df2 = get_course_info(part2)

In [6]:
# Batch 3: links 100-149 -> (courses frame, reviews frame).
part3 = link_list[100:150]
part3_df1, part3_df2 = get_course_info(part3)

In [7]:
# Batch 4: every remaining link from index 150 onward.
part4 = link_list[150:]
part4_df1, part4_df2 = get_course_info(part4)

In [8]:
# Stack the four per-batch course tables and renumber the rows from 0.
courses = pd.concat(
    [part1_df1, part2_df1, part3_df1, part4_df1],
    ignore_index=True,
)
courses.head()

Unnamed: 0,Course ID,Course Name,Unit,Review Number,Avg Star,Level,Results
0,Business Analysis & Process Management,Business Analysis & Process Management,Coursera Project Network,Recommended,4.4,Beginner level,Analyze business processes and find solutions ...
1,"Python for Data Science, AI & Development","Python for Data Science, AI & Development",IBM,37039,4.6,Beginner level,Learn Python - the most popular programming la...
2,Introduction to Microsoft Excel,Introduction to Microsoft Excel,Coursera Project Network,Recommended,4.6,Intermediate level,Create an Excel spreadsheet and learn how to m...
3,English for Common Interactions in the Workpla...,English for Common Interactions in the Workpla...,Pontificia Universidad Católica de Chile,2200,4.7,8 hours to complete,Describe professions and daily activities rel...
4,Build a free website with WordPress,Build a free website with WordPress,Coursera Project Network,881,4.4,Guided Project,Open a free website on WordPress it and add pa...


In [9]:
# Stack the four per-batch review tables and renumber the rows from 0.
reviews = pd.concat(
    [part1_df2, part2_df2, part3_df2, part4_df2],
    ignore_index=True,
)
reviews.head()

Unnamed: 0,Course Name,Reviewer Name,Date of Review,Review Content,Rating Star
0,Business Analysis & Process Management,By Thomas G,"Dec 15, 2020","""Pause and look at this doc ok welcome back no...",1
1,Business Analysis & Process Management,By Gisela W,"Dec 19, 2020",I like the hands-on platform and the approach ...,2
2,Business Analysis & Process Management,By Deleted A,"Feb 11, 2021",Coursera please remove this course! It's poor...,1
3,Business Analysis & Process Management,By TUSHAR S,"Feb 1, 2021",This instructor has a very poor style of teach...,1
4,Business Analysis & Process Management,By Ellvina K,"May 7, 2021",Even at the start I was unable to follow throu...,1


In [10]:
# Write both combined tables to CSV files in the working directory,
# leaving out the DataFrame index column.
courses.to_csv("courses.csv", index=False)
reviews.to_csv("reviews.csv", index=False)