In [2]:
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
import time
import requests
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# BeautifulSoup
The source is a dynamic website, thus using BeautifulSoup won't reveal our desired elements

In [57]:
url = "https://ocw.mit.edu/search/"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup)
# content won't show up

<!DOCTYPE html>

<html lang="en">
<head>
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
    new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
    j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
    'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
    })(window,document,'script','dataLayer','GTM-NMQZ25T');</script>
<link href="/static_shared/css/www.64cff.css" rel="stylesheet"/>
<link href="/static_shared/css/common.64cff.css" rel="stylesheet"/>
<link href="//cdn-images.mailchimp.com/embedcode/classic-061523.css" rel="stylesheet" type="text/css">
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1, viewport-fit=cover" name="viewport"/>
<meta content="MIT OpenCourseWare is a web based publication of virtually all MIT course content. OCW is open and available to the world and is a permanent MIT activity" name="description"/>
<meta content="opencourseware,MIT OCW,cours

# Selenium

Better for dynamic content. However, all roughly 2000 courses do not show up immediately. New courses will show up as users scroll down. To get courses more than what the page initially shows, we have to simulate scrolling using Selenium. 

However, did not manage to get all courses, only 710 courses.

In [88]:
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

In [81]:
def scroll_down(driver):
    """A method for scrolling the page."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # scroll to  bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # wait to load page
        time.sleep(6)
        # calculate new height and compare with last height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

In [82]:
url = "https://ocw.mit.edu/search/"
try:
    driver.get(url)

    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//*[@aria-label="OpenCourseWare Search Results"]'))
    )

    scroll_down(driver)

    url_elements = driver.find_elements(By.XPATH, '//*[@class="lr-row course-title"]/a')
    course_urls = []

    for el in url_elements:
        course_url = el.get_attribute('href')
        course_urls.append(course_url)
except:
    print("An error occurred while scraping the course URLs.")

In [83]:
len(course_urls)

2568

Fetched all 2568 URLs/courses. Next, loop through all URLs to get title, description, etc.

In [None]:
courses = []
for url in course_urls:
    try:
        driver.get(url)
        
        # wait to ensure the page is fully loaded
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'course-number-term-detail'))
        )
        
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # extract course details, all id and class names are based on the current structure of the MIT OCW website
        number_term_level = soup.select_one('.course-number-term-detail').text
        number = number_term_level.split(' | ')[0]
        semester = number_term_level.split(' | ')[1].split()[0]
        year = number_term_level.split(' | ')[1].split()[1]
        level = number_term_level.split(' | ')[2]
        title = soup.select_one('#course-banner > div > div > div:nth-of-type(1) > h1 > a').text.strip()
        description = soup.select('.description')[-1].text.replace("Show less", "")
        
        # there could be more than one instructor, department, and topic
        instructors = [el.text for el in soup.select('#course-main-content .course-info-instructor')]
        departments = [el.text for el in soup.select('#course-main-content .course-info-department')]
        topics = [el.text for el in soup.select('#course-main-content .course-info-topic')]
    
        courses.append({"number": number,
                        "semester": semester,
                        "year":year,
                        "level": level,
                        "title": title,
                        "description": description,
                        "instructors": instructors,
                        "departments": departments,
                        "topics": topics})
        
        time.sleep(4)
    except:
        print(f"An error occurred while scraping the course details from {url}.")
        continue

In [None]:
#result is here
courses

[{'number': '14.44',
  'semester': 'Spring',
  'year': '2007',
  'level': 'Undergraduate',
  'title': 'Energy Economics',
  'description': 'This course explores the theoretical and empirical perspectives on individual and industrial demand for energy, energy supply, energy markets, and public policies affecting energy markets. It discusses aspects of the oil, natural gas, electricity, and nuclear power sectors and examines energy tax, price regulation, deregulation, energy efficiency and policies for controlling emission.',
  'instructors': ['Prof. Paul Joskow'],
  'departments': ['Economics'],
  'topics': ['Energy',
   'Electricity',
   'Fossil Fuels',
   'Nuclear',
   'Science',
   'Earth Science',
   'Sustainability',
   'Social Science',
   'Economics',
   'Microeconomics',
   'Political Economy',
   'Public Administration',
   'Environmental Policy']},
 {'number': '21A.101J (formerly 21A.218J)',
  'semester': 'Spring',
  'year': '2010',
  'level': 'Undergraduate',
  'title': 'Iden

# API
API can be used to retrieve all courses easily. API is found through the network tab of inspect element. 

In [61]:
url = "https://open.mit.edu/api/v0/search/"
# change the size parameters to course max count
body = {"from":0,"size": 2568,"post_filter":{"bool":{"must":[{"bool":{"should":[{"term":{"object_type.keyword":"course"}}]}},{"bool":{"should":[{"term":{"offered_by":"OCW"}}]}}]}},"query":{"bool":{"should":[{"bool":{"filter":{"bool":{"must":[{"term":{"object_type":"course"}}]}}}}]}},"aggs":{"agg_filter_topics":{"filter":{"bool":{"should":[{"bool":{"filter":{"bool":{"must":[{"bool":{"should":[{"term":{"object_type.keyword":"course"}}]}},{"bool":{"should":[{"term":{"offered_by":"OCW"}}]}}]}}}}]}},"aggs":{"topics":{"terms":{"field":"topics","size":10000}}}},"agg_filter_department_name":{"filter":{"bool":{"should":[{"bool":{"filter":{"bool":{"must":[{"bool":{"should":[{"term":{"object_type.keyword":"course"}}]}},{"bool":{"should":[{"term":{"offered_by":"OCW"}}]}}]}}}}]}},"aggs":{"department_name":{"terms":{"field":"department_name","size":10000}}}},"agg_filter_level":{"filter":{"bool":{"should":[{"bool":{"filter":{"bool":{"must":[{"bool":{"should":[{"term":{"object_type.keyword":"course"}}]}},{"bool":{"should":[{"term":{"offered_by":"OCW"}}]}}]}}}}]}},"aggs":{"level":{"nested":{"path":"runs"},"aggs":{"level":{"terms":{"field":"runs.level","size":10000},"aggs":{"courses":{"reverse_nested":{}}}}}}}},"agg_filter_course_feature_tags":{"filter":{"bool":{"should":[{"bool":{"filter":{"bool":{"must":[{"bool":{"should":[{"term":{"object_type.keyword":"course"}}]}},{"bool":{"should":[{"term":{"offered_by":"OCW"}}]}}]}}}}]}},"aggs":{"course_feature_tags":{"terms":{"field":"course_feature_tags","size":10000}}}}}}
headers = {
    # introduce as bot
    "User-Agent": "Scraping Bot", 
    "Accept": "application/json"
}

try:
    result = requests.post(url, json = body)
    result = result.json()
except:
    print("An error occurred while fetching the course data from the API.")
    result = {}

In [None]:
# inspect the json structure to retrieve only the fields we want
result

{'took': 284,
 'timed_out': False,
 '_shards': {'total': 3, 'successful': 3, 'skipped': 0, 'failed': 0},
 'hits': {'total': 2568,
  'max_score': 0.0,
  'hits': [{'_index': 'discussions_course_04a2ab5166654d39b8fbe8b12222e18a',
    '_type': '_doc',
    '_id': 'co_ocw_NDM1YTk3ZmRlYmYwZDM2MzEyYmVmOGQ3MmRkYWQ3NjUrMTQuNDQ',
    '_score': 0.0,
    '_source': {'id': 7751,
     'course_id': '435a97fdebf0d36312bef8d72ddad765+14.44',
     'coursenum': '14.44',
     'short_description': 'This course explores the theoretical and empirical perspectives on individual and industrial demand for energy, energy supply, energy markets, and public policies affecting energy markets. It discusses aspects of the oil, natural gas, electricity, and nuclear power sectors and examines energy tax, price regulation, deregulation, energy efficiency and policies for controlling emission.',
     'full_description': None,
     'platform': 'ocw',
     'title': 'Energy Economics',
     'image_src': '/courses/14-44-energ

In [65]:
api_courses = []

for data in result['hits']['hits']:
    data = data['_source']
    course = {}
    course['number'] = data['coursenum'].split()[0]
    course['title'] = data['title']
    course['level'] = data['runs'][0]['level']
    course['semester'] = data['runs'][0]['semester']
    course['year'] = data['runs'][0]['year']
    course['description'] = data['runs'][0]['short_description']
    course['topics'] = data['topics']
    course['instructors'] = data['runs'][0]['instructors']
    course['department_name'] = data['department_name']

    api_courses.append(course)

In [66]:
api_courses

[{'number': '14.44',
  'title': 'Energy Economics',
  'level': ['Undergraduate'],
  'semester': 'Spring',
  'year': 2007,
  'description': 'This course explores the theoretical and empirical perspectives on individual and industrial demand for energy, energy supply, energy markets, and public policies affecting energy markets. It discusses aspects of the oil, natural gas, electricity, and nuclear power sectors and examines energy tax, price regulation, deregulation, energy efficiency and policies for controlling emission.',
  'topics': ['Science',
   'Economics',
   'Social Science',
   'Public Administration',
   'Earth Science',
   'Energy',
   'Microeconomics',
   'Political Economy',
   'Fossil Fuels',
   'Nuclear',
   'Electricity',
   'Environmental Policy',
   'Sustainability'],
  'instructors': ['Prof. Paul Joskow'],
  'department_name': ['Economics']},
 {'number': '21A.101J',
  'title': 'Identity and Difference',
  'level': ['Undergraduate'],
  'semester': 'Spring',
  'year': 

# Text Processing

In [67]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')

def lowering(text: str) -> str:
    text = text.lower()
    return text

def remove_punctuation_and_symbol(text: str) -> str:
    text = re.sub(r'[^\w\s]', '', text)
    return text

def stopword_removal(text: str) -> str:
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

# use lemmatization instead of stemming for better accuracy and context understanding
def lemmatization(text: str) -> str:
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

def preprocessing(text: str) -> str:

    text = lowering(text)
    text = remove_punctuation_and_symbol(text)
    text = stopword_removal(text)
    text = lemmatization(text)

    return text

In [68]:
# text fields that can be used for text vectorization are title, description, topics, and department_name

for course in api_courses:
    course['preprocessed_description'] = preprocessing(course['description'])
    course['preprocessed_title'] = preprocessing(course['title'])
    course['preprocessed_topics'] = [preprocessing(topic) for topic in course['topics']]
    course['preprocessed_department_name'] = [preprocessing(department) for department in course['department_name']]

In [75]:
print("Original description:", api_courses[0]['description'])
print("Preprocessed description:", api_courses[0]['preprocessed_description'])

Original description: This course explores the theoretical and empirical perspectives on individual and industrial demand for energy, energy supply, energy markets, and public policies affecting energy markets. It discusses aspects of the oil, natural gas, electricity, and nuclear power sectors and examines energy tax, price regulation, deregulation, energy efficiency and policies for controlling emission.
Preprocessed description: course explores theoretical empirical perspective individual industrial demand energy energy supply energy market public policy affecting energy market discus aspect oil natural gas electricity nuclear power sector examines energy tax price regulation deregulation energy efficiency policy controlling emission


In [76]:
print("Original title:", api_courses[0]['title'])
print("Preprocessed title:",api_courses[0]['preprocessed_title'])

Original title: Energy Economics
Preprocessed title: energy economics


In [77]:
print("Original topics:", api_courses[0]['topics'])
print("Preprocessed topics:", api_courses[0]['preprocessed_topics'])

Original topics: Science, Economics, Social Science, Public Administration, Earth Science, Energy, Microeconomics, Political Economy, Fossil Fuels, Nuclear, Electricity, Environmental Policy, Sustainability
Preprocessed topics: science, economics, social science, public administration, earth science, energy, microeconomics, political economy, fossil fuel, nuclear, electricity, environmental policy, sustainability


In [78]:
print("Original department:",api_courses[0]['department_name'])
print("Preprocessed department:", api_courses[0]['preprocessed_department_name'])

Original department: Economics
Preprocessed department: economics


# Store Data
Akan disimpan sebagai CSV karena lebih mudah untuk data analysis

In [73]:
import pandas as pd

# convert list to string separated by ", "
for course in api_courses:
    course['instructors'] = ", ".join(course['instructors'] if isinstance(course['instructors'], list) else [course['instructors']])
    course['department_name'] = ", ".join(course['department_name'] if isinstance(course['department_name'], list) else [course['department_name']])
    course['topics'] = ", ".join(course['topics'] if isinstance(course['topics'], list) else [course['topics']])
    course['preprocessed_department_name'] = ", ".join(course['preprocessed_department_name'] if isinstance(course['preprocessed_department_name'], list) else [course['preprocessed_department_name']])
    course['level'] = ", ".join(course['level'] if isinstance(course['level'], list) else [course['level']])
    course['preprocessed_topics'] = ", ".join(course['preprocessed_topics'] if isinstance(course['preprocessed_topics'], list) else [course['preprocessed_topics']])

courses_df = pd.DataFrame(api_courses)

In [50]:
display(courses_df)

Unnamed: 0,number,title,level,semester,year,description,topics,instructors,department_name,preprocessed_description,preprocessed_title,preprocessed_topics,preprocessed_department_name
0,14.44,Energy Economics,Undergraduate,Spring,2007.0,This course explores the theoretical and empir...,"Science, Economics, Social Science, Public Adm...",Prof. Paul Joskow,Economics,course explores theoretical empirical perspect...,energy economics,"science, economics, social science, public adm...",economics
1,21A.101J,Identity and Difference,Undergraduate,Spring,2010.0,"This course explores how identities, whether o...","Social Science, Society, Anthropology, Gender ...",Prof. Jean Jackson,"Anthropology, Women's and Gender Studies",course explores identity whether individual gr...,identity difference,"social science, society, anthropology, gender ...","anthropology, woman gender study"
2,8.06,Quantum Physics III,Undergraduate,Spring,2018.0,This course is a continuation of [*8.05 Quantu...,"Science, Physics, Theoretical Physics, Quantum...",Prof. Barton Zwiebach,Physics,course continuation 805 quantum physic iicours...,quantum physic iii,"science, physic, theoretical physic, quantum m...",physic
3,11.522,Research Seminar on Urban Information Systems,Graduate,Fall,2005.0,Seminar participants and invited guests will l...,"Engineering, Computer Science, Social Science,...",Prof. Joseph Ferreira,Urban Studies and Planning,seminar participant invited guest lead critica...,research seminar urban information system,"engineering, computer science, social science,...",urban study planning
4,7.016,Introductory Biology,Undergraduate,Fall,2018.0,_7.016 Introductory Biology_ provides an intro...,"Science, Health and Medicine, Biology, Genetic...","Prof. Barbara Imperiali, Prof. Adam Martin, Dr...",Biology,_7016 introductory biology_ provides introduct...,introductory biology,"science, health medicine, biology, genetics, m...",biology
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2562,18.S096,Matrix Calculus for Machine Learning and Beyond,Undergraduate,January IAP,2023.0,We all know that calculus courses such as [*18...,"Mathematics, Applied Mathematics, Linear Algeb...","Prof. Alan Edelman, Prof. Steven G. Johnson",Mathematics,know calculus course 1801 single variable calc...,matrix calculus machine learning beyond,"mathematics, applied mathematics, linear algeb...",mathematics
2563,18.404J,Theory of Computation,"Undergraduate, Graduate",Fall,2020.0,This course emphasizes computability and compu...,"Engineering, Computer Science, Mathematics, Co...",Prof. Michael Sipser,"Mathematics, Electrical Engineering and Comput...",course emphasizes computability computational ...,theory computation,"engineering, computer science, mathematics, co...","mathematics, electrical engineering computer s..."
2564,RES.STR-001,Geographic Information System (GIS) Tutorial,Non-Credit,January IAP,2022.0,Learn how to read and interpret maps and data ...,"Social Science, Geography",MIT Libraries GIS Services Group,,learn read interpret map data use basic cartog...,geographic information system gi tutorial,"social science, geography",
2565,RES.ENV-003,EarthDNA's Climate 101,Non-Credit,Fall,2019.0,The Climate 101 presentation was developed by ...,"Science, Earth Science, Environmental Science,...","Brandon Leshchinskiy, Dava Newman",,climate 101 presentation developed brandon les...,earthdnas climate 101,"science, earth science, environmental science,...",


In [51]:
# take only the preprocessed fields

truncated_courses_df = courses_df[['number', 'semester', 'year', 'level', 'preprocessed_title', 'preprocessed_description', 'instructors', 'preprocessed_department_name', 'preprocessed_topics']]

In [None]:
truncated_courses_df.to_csv('courses.csv', index=False)

In [74]:
# see unique values of categorical fields
print("Unique values of level:", truncated_courses_df['level'].unique())
print("Unique values of semester:", truncated_courses_df['semester'].unique())
print("Unique values of year:", truncated_courses_df['year'].unique())

Unique values of level: ['Undergraduate' 'Graduate' 'Undergraduate, Graduate'
 'Graduate, Undergraduate' 'Non-Credit' 'High School'
 'Undergraduate, Graduate, Non-Credit' 'Graduate, Non-Credit']
Unique values of semester: ['Spring' 'Fall' 'January IAP' 'Summer' None]
Unique values of year: [2007. 2010. 2018. 2005. 2006. 2009. 2016. 2011. 2014. 2008. 2017. 2013.
 2012. 2003. 2004. 2019. 2015. 2002. 2001. 1999. 2000. 1998. 2021. 2020.
 2022. 2023. 2024.   nan 1997. 2025.]
