# HarvardX Online Courses

General information on free online courses from Harvard on edX.org

In [None]:
! pip install selenium webdriver_manager

In [36]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from time import sleep
import os
import logging


from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

In [40]:
# NOTE(review): the intent appears to be silencing webdriver_manager's log
# output. NOTSET makes the 'WDM' logger defer to its parent's level rather
# than muting it, and how the WDM_LOG value is interpreted depends on the
# installed webdriver_manager version -- confirm this has the desired effect.
logging.getLogger('WDM').setLevel(logging.NOTSET)
os.environ['WDM_LOG'] = 'False'


# Driver set-up
# Presumably works around install() returning the path of the
# "THIRD_PARTY_NOTICES.chromedriver" file instead of the driver binary;
# removing the prefix leaves the path of the executable next to it.
manager = ChromeDriverManager().install().replace("THIRD_PARTY_NOTICES.", '')
# File modes are octal. The decimal literal 755 is 0o1363 (sticky bit set,
# owner not readable), so use 0o755 to get the intended rwxr-xr-x.
os.chmod(manager, 0o755)
service = Service(manager)
driver = webdriver.Chrome(service=service)

In [57]:
def _first_token(node):
  """Return the first whitespace-separated token of a tag's text, or None."""
  if node is None:
    return None
  tokens = node.text.split()
  return tokens[0] if tokens else None


def _to_number(token, cast):
  """Convert a scraped token such as '4.6' or '1,234' with `cast`, else None."""
  # Purely alphabetic tokens are labels rather than values; rejecting them
  # up front also keeps float() from accepting words like 'nan' or 'inf'.
  if not token or token.isalpha():
    return None
  try:
    return cast(token.replace(',', ''))
  except ValueError:
    return None


def get_course_info(course_url, course_name):
  """Scrape a single course page and return its details as a flat dict.

  Args:
    course_url: URL of the course page to download.
    course_name: Display name stored under the 'Name' key.

  Returns:
    A dict with the keys 'Name', 'Duration', 'Rating', 'Number of ratings',
    'Total enrolled students', 'Subject', 'Level', 'Prerequisites',
    'Language' and 'Associated skills'. Values that cannot be found or
    parsed on the page are None, except "at a glance" fields that are
    absent from the page, which are returned as an empty string.

  Raises:
    requests.RequestException: If the page cannot be downloaded (network
      failure, timeout, or an HTTP error status).
  """
  # A timeout keeps one stalled connection from hanging the whole scrape, and
  # raise_for_status() avoids parsing an error page as if it were a course.
  response = requests.get(course_url, timeout=30)
  response.raise_for_status()
  page = BeautifulSoup(response.text, 'html.parser')

  # The "at a glance" card is a list of "Label: value" items.
  glance = page.find(class_="at-a-glance")
  card_items = {}
  for item in (glance.find_all('li') if glance else []):
    label, sep, value = item.text.partition(':')
    if sep:
      card_items[label.strip()] = value.strip()

  data = dict()
  data['Name'] = course_name

  duration = page.find(class_="h4 mb-0")
  data['Duration'] = duration.text if duration else None

  rating = page.find(class_="h5 ml-1 mr-3 mb-0")
  data['Rating'] = _to_number(_first_token(rating), float)

  ratings_count = page.find(class_="micro")
  data['Number of ratings'] = _to_number(_first_token(ratings_count), int)

  subheading = page.find(attrs={"data-test-id": "selector-subheading"})
  enrolled = subheading.find(class_="small") if subheading else None
  data['Total enrolled students'] = _to_number(_first_token(enrolled), int)

  fields = ['Subject', 'Level', 'Prerequisites', 'Language', 'Associated skills']
  for field in fields:
    value = card_items.get(field, '')
    # A value containing the text 'None' (e.g. prerequisites) becomes a real None.
    data[field] = None if 'None' in value else value
  return data


# Module-level accumulator: get_all_courses appends one dict per scraped course.
DATASET = list()


def get_all_courses(courses_page_url, delay=3):
  """Scrape every course card on a listing page and append the results to DATASET.

  Opens the page in the module-level Selenium `driver`, clicks the "Show"
  button when one is present, then fetches the details of each course card
  with get_course_info. The name of each successfully scraped course is
  printed.

  Args:
    courses_page_url: URL of the page that lists the course cards.
    delay: Seconds to pause after each successfully scraped course.
  """
  from selenium.common.exceptions import WebDriverException

  driver.get(courses_page_url)
  try:
    driver.find_element(By.XPATH, ".//button[contains(text(), 'Show')]").click()
  except WebDriverException:
    # The button is optional: carry on with whatever cards are already shown.
    pass
  container = driver.find_element(By.XPATH, './/*[@data-testid="card-container"]')
  course_cards = container.find_elements(By.XPATH, './/*[@class="base-card-wrapper"]')

  for card in course_cards:
    try:
      url = card.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
      name = card.find_element(By.XPATH, './/*[@class="pgn__card-header-title-md"]').text.replace('\n', ' ')
      info = get_course_info(url, name)
    except Exception as exc:
      # Best effort: one bad card must not abort the run, but report it.
      # Catching Exception (not a bare except) keeps Ctrl-C working.
      logging.warning("Skipping course card: %s", exc)
      continue
    DATASET.append(info)
    print(name)
    sleep(delay)

In [58]:
# Scrape the course cards on the HarvardX school page; results accumulate in DATASET.
get_all_courses("https://www.edx.org/school/harvardx#programs")

CS50's Introduction to Computer Science
CS50's Introduction to Programming with Python
CS50's Introduction to Artificial Intelligence with Python
CS50's Web Programming with Python and JavaScript
Exercising Leadership: Foundational Principles
CS50's Introduction to Cybersecurity
Rhetoric: The Art of Persuasive Writing and Public Speaking
Building Personal Resilience: Managing Anxiety and Mental He…
Human Anatomy: Musculoskeletal Cases
Data Science: R Basics
Data Science: Machine Learning
CS50's Introduction to Databases with SQL
CS50's Introduction to Programming with Scratch
Remote Work Revolution for Everyone
Contract Law: From Trust to Promise to Contract
Managing Happiness
Entrepreneurship in Emerging Economies
Data Science: Visualization
Fundamentals of Neuroscience, Part 1: The Electrical Properti…
Machine Learning and AI with Python
Justice
The Architectural Imagination
Data Science: Probability
Calculus Applied!
Technology Entrepreneurship: Lab to Market
CS50's Computer Science

In [59]:
# Build a table from the scraped course dicts in DATASET.
DF = pd.DataFrame(DATASET)
DF  # last expression in the cell, so the notebook displays the table

Unnamed: 0,Name,Duration,Rating,Number of ratings,Total enrolled students,Subject,Level,Prerequisites,Language,Associated skills
0,CS50's Introduction to Computer Science,12 weeks,,,6163497.0,Computer Science,Introductory,,English,"Data Structures, C (Programming Language), Sec..."
1,CS50's Introduction to Programming with Python,10 weeks,,,1121446.0,Computer Science,Introductory,,English,"Data Science, C (Programming Language), Web Br..."
2,CS50's Introduction to Artificial Intelligence...,7 weeks,,,1160141.0,Computer Science,Introductory,CS50 or prior programming experience in Python.,English,"Handwriting Recognition, Mathematical Optimiza..."
3,CS50's Web Programming with Python and JavaScript,12 weeks,,,1499184.0,Computer Science,Intermediate,CS50 or prior programming experience in any la...,English,"Scalability, Cloud Services, React.js, Github,..."
4,Exercising Leadership: Foundational Principles,4 weeks,4.6,960.0,636807.0,Business & Management,Introductory,,English,Leadership
...,...,...,...,...,...,...,...,...,...,...
151,"The Book: Monasteries, Schools, and Notaries, ...",3 weeks,,,,Humanities,Introductory,,English,Notary Services
152,The Book: Making and Meaning in the Medieval M...,8 weeks,,,,Humanities,Introductory,,English,"Collections, Production Process"
153,The Book: The Medieval Book of Hours: Art and ...,4 weeks,,,,Literature,Introductory,,English,"Literacy, Painting"
154,The Book: The History of the Book in the 17th ...,1 weeks,,,,Humanities,Introductory,,English,


In [64]:
# Write the table to harvardx.csv in the current working directory.
# NOTE(review): the row index is written too, which shows up as an
# "Unnamed: 0" column when the file is read back -- consider index=False.
DF.to_csv("harvardx.csv")

In [65]:
# Print a summary of the columns: non-null counts and dtypes.
DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Name                     156 non-null    object 
 1   Duration                 156 non-null    object 
 2   Rating                   113 non-null    float64
 3   Number of ratings        113 non-null    float64
 4   Total enrolled students  125 non-null    float64
 5   Subject                  156 non-null    object 
 6   Level                    156 non-null    object 
 7   Prerequisites            34 non-null     object 
 8   Language                 156 non-null    object 
 9   Associated skills        156 non-null    object 
dtypes: float64(3), object(7)
memory usage: 12.3+ KB
