In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import json
import time
import csv

In [76]:
driver = webdriver.Chrome()

# Load the webpage where you want to use the cookies
driver.get("http://academicaffairs.ucsd.edu/")

# Load cookies from the JSON file
with open('academicaffairs.ucsd.edu.cookies.json', 'r') as file:
    cookies_raw = json.load(file)

# Add each cookie to the Selenium
for cookie in cookies_raw:
    driver.add_cookie(cookie)

# Refresh the page to apply the cookies
driver.refresh()

# Get SET Page
driver.get("http://academicaffairs.ucsd.edu/Modules/Evals/SET/Reports/Search.aspx")

time.sleep(5)

# Find the unit dropdown and collect all unit options
unit_dropdown = driver.find_element(By.ID, "ContentPlaceHolder1_EvalsContentPlaceHolder_ddlUnit")
unit_options = unit_dropdown.find_elements(By.TAG_NAME, "option")
units = [(option.text, option.get_attribute("value")) for option in unit_options if option.get_attribute("value")]
with open('scraped_data.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Instructor', 'Course', 'Term', 'Enrolled/Resp Rate', 'Avg Grade Received', 'Avg Hours Worked', 'Student Learning', 'Course Structure', 'Class Environment'])

    # Start at a course / unit
    no_start = False
    start_unit = "Communication"
    start_course = "COMM 196B - Honors Seminar II: Research"
    found_start_unit = False
    found_start_course = False


    for unit in units:
        # Skip courses until the start unit is found
        if not found_start_unit and not no_start:
            if unit[0] == start_unit:
                found_start_unit = True
            else:
                continue
        # Select a unit
        unit_dropdown = driver.find_element(By.ID, "ContentPlaceHolder1_EvalsContentPlaceHolder_ddlUnit")
        for option in unit_dropdown.find_elements(By.TAG_NAME, "option"):
            if option.get_attribute("value") == unit[1]:
                option.click()
                break
    
        # Wait for the course dropdown to be populated
        time.sleep(2)
        course_dropdown = driver.find_element(By.ID, "ContentPlaceHolder1_EvalsContentPlaceHolder_ddlCourse")
        course_options = course_dropdown.find_elements(By.TAG_NAME, "option")
    
        # Extract course information
        courses = [(option.text, option.get_attribute("value")) for option in course_options if option.get_attribute("value")]
    
        for course in courses:
            # Skip courses until the start course is found
            if not found_start_course and not no_start:
                if course[0] == start_course:
                    found_start_course = True
                else:
                    continue

            # Select a course
            course_dropdown = driver.find_element(By.ID, "ContentPlaceHolder1_EvalsContentPlaceHolder_ddlCourse")
            for option in course_dropdown.find_elements(By.TAG_NAME, "option"):
                if option.get_attribute("value") == course[1]:
                    option.click()
                    break
    
            # Click the search button
            search_button = driver.find_element(By.ID, "ContentPlaceHolder1_EvalsContentPlaceHolder_btnSubmit")
            search_button.click()
    
            # Wait for the table to load
            time.sleep(5)
    
            # Scrape the table data using BeautifulSoup
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            # Try to find a table on the page
            try:
                # Scrape the SET Teaching Results
                table = soup.find("table")
            
                # Extract table rows
                rows = table.find('tbody').find_all('tr')
                for row in rows:
                    cells = row.find_all('td')
                    row_data = [cell.text.strip() for cell in cells]
                    writer.writerow([unit[0], course[0]] + row_data)
            except:
                continue

# Close the browser
driver.quit()


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="ContentPlaceHolder1_EvalsContentPlaceHolder_ddlCourse"]"}
  (Session info: chrome=126.0.6478.127)
Stacktrace:
	GetHandleVerifier [0x00007FF734CB22C2+60002]
	(No symbol) [0x00007FF734C2CA59]
	(No symbol) [0x00007FF734AE7EDA]
	(No symbol) [0x00007FF734B376E6]
	(No symbol) [0x00007FF734B377AC]
	(No symbol) [0x00007FF734B7E9D7]
	(No symbol) [0x00007FF734B5C2CF]
	(No symbol) [0x00007FF734B7BCC7]
	(No symbol) [0x00007FF734B5C033]
	(No symbol) [0x00007FF734B29657]
	(No symbol) [0x00007FF734B2A251]
	GetHandleVerifier [0x00007FF734FC3E2D+3278285]
	GetHandleVerifier [0x00007FF735010190+3590448]
	GetHandleVerifier [0x00007FF7350061D0+3549552]
	GetHandleVerifier [0x00007FF734D61DE6+779654]
	(No symbol) [0x00007FF734C37ACF]
	(No symbol) [0x00007FF734C32EE4]
	(No symbol) [0x00007FF734C33072]
	(No symbol) [0x00007FF734C22C4F]
	BaseThreadInitThunk [0x00007FF9B2247344+20]
	RtlUserThreadStart [0x00007FF9B3ADCC91+33]


In [4]:
import pandas as pd
import re

In [77]:
scraped_data = pd.read_csv('scraped_data.csv')

## Cleaning Data &#x1F6AE;
After scraping the data, there are some things to clean.
#### Todo List
- Fix types
- Handle null values
- Column Names
    - Implement naming conventions.
- Avg Grade Received Column
    - Remove trailing white space and commas
- Course Column and Indexes
    - Remove redundancy
    - Group by instructor, course and term
- Enrolled/Resp Rate
    - Separate Columns
- Term Column
    - Make sure terms start from Summer 2023 (when SET was introduced)
 

In [78]:
scraped_data

Unnamed: 0,Unnamed: 1,Instructor,Course,Term,Enrolled/Resp Rate,Avg Grade Received,Avg Hours Worked,Student Learning,Course Structure,Class Environment
Communication,COMM 196B - Honors Seminar II: Research,"Dewaard, Andrew Michael",COMM 196B - Honors Seminar II: Research (A00),WI24,8(62.50%),4.00 ...,9.00,5.00,4.94,4.95
Computational Social Sciences,CSS 1 - Prog Computational Social Sci,"Mignozzetti, Umberto",CSS 1 - Prog Computational Social Sci (A00),WI24,109(77.98%),3.68 ...,4.43,4.43,4.44,4.47
Computational Social Sciences,CSS 1 - Prog Computational Social Sci,"Trott, Sean Thomas",CSS 1 - Prog Computational Social Sci (A00),FA23,118(38.14%),3.72 ...,6.22,4.41,4.31,4.55
Computational Social Sciences,CSS 1 - Prog Computational Social Sci,"Mignozzetti, Umberto",CSS 1 - Prog Computational Social Sci (B00),FA23,66(75.76%),3.64 ...,5.50,4.40,4.38,4.50
Computational Social Sciences,CSS 2 - Data/Model Programming: CSS,"Trott, Sean Thomas",CSS 2 - Data/Model Programming: CSS (A00),WI24,77(33.77%),3.82 ...,4.12,4.67,4.67,4.75
...,...,...,...,...,...,...,...,...,...,...
History,HIEU 176 - Politics in the Jewish Past,"Hertz, Deborah",HIEU 176 - Politics in the Jewish Past (A00),SP24,14(57.14%),3.79 ...,3.75,4.66,4.50,4.69
History,HIEU 183 - Social Hist/Mediterranean,"Gallant, Thomas W.",HIEU 183 - Social Hist/Mediterranean (A00),SP24,23(39.13%),3.79 ...,4.33,4.54,4.37,4.57
History,HIGL 127 - Sport in the Modern World,"Ivey, James Alexander",HIGL 127 - Sport in the Modern World (A00),SP24,36(33.33%),3.87 ...,4.00,4.73,4.71,4.73
History,HILA 100 - Conquest/Empire: The Americas,"Murillo, Dana V.",HILA 100 - Conquest/Empire: The Americas (A00),FA23,46(30.43%),2.98 ...,6.43,4.52,4.55,4.73


In [79]:
scraped_data.columns

Index(['Instructor', 'Course', 'Term', 'Enrolled/Resp Rate',
       'Avg Grade Received', 'Avg Hours Worked', 'Student Learning',
       'Course Structure', 'Class Environment'],
      dtype='object')

In [80]:
scraped_data.dtypes

Instructor             object
Course                 object
Term                   object
Enrolled/Resp Rate     object
Avg Grade Received     object
Avg Hours Worked       object
Student Learning      float64
Course Structure      float64
Class Environment     float64
dtype: object

In [81]:
scraped_data

Unnamed: 0,Unnamed: 1,Instructor,Course,Term,Enrolled/Resp Rate,Avg Grade Received,Avg Hours Worked,Student Learning,Course Structure,Class Environment
Communication,COMM 196B - Honors Seminar II: Research,"Dewaard, Andrew Michael",COMM 196B - Honors Seminar II: Research (A00),WI24,8(62.50%),4.00 ...,9.00,5.00,4.94,4.95
Computational Social Sciences,CSS 1 - Prog Computational Social Sci,"Mignozzetti, Umberto",CSS 1 - Prog Computational Social Sci (A00),WI24,109(77.98%),3.68 ...,4.43,4.43,4.44,4.47
Computational Social Sciences,CSS 1 - Prog Computational Social Sci,"Trott, Sean Thomas",CSS 1 - Prog Computational Social Sci (A00),FA23,118(38.14%),3.72 ...,6.22,4.41,4.31,4.55
Computational Social Sciences,CSS 1 - Prog Computational Social Sci,"Mignozzetti, Umberto",CSS 1 - Prog Computational Social Sci (B00),FA23,66(75.76%),3.64 ...,5.50,4.40,4.38,4.50
Computational Social Sciences,CSS 2 - Data/Model Programming: CSS,"Trott, Sean Thomas",CSS 2 - Data/Model Programming: CSS (A00),WI24,77(33.77%),3.82 ...,4.12,4.67,4.67,4.75
...,...,...,...,...,...,...,...,...,...,...
History,HIEU 176 - Politics in the Jewish Past,"Hertz, Deborah",HIEU 176 - Politics in the Jewish Past (A00),SP24,14(57.14%),3.79 ...,3.75,4.66,4.50,4.69
History,HIEU 183 - Social Hist/Mediterranean,"Gallant, Thomas W.",HIEU 183 - Social Hist/Mediterranean (A00),SP24,23(39.13%),3.79 ...,4.33,4.54,4.37,4.57
History,HIGL 127 - Sport in the Modern World,"Ivey, James Alexander",HIGL 127 - Sport in the Modern World (A00),SP24,36(33.33%),3.87 ...,4.00,4.73,4.71,4.73
History,HILA 100 - Conquest/Empire: The Americas,"Murillo, Dana V.",HILA 100 - Conquest/Empire: The Americas (A00),FA23,46(30.43%),2.98 ...,6.43,4.52,4.55,4.73


In [82]:
scraped_data[[col for col in scraped_data.columns]].isnull().sum()

Instructor              0
Course                  0
Term                    0
Enrolled/Resp Rate      6
Avg Grade Received      0
Avg Hours Worked      149
Student Learning        4
Course Structure      200
Class Environment     200
dtype: int64

In [83]:
df = scraped_data.reset_index()

# Handle nulls
df = df.fillna(-1)

# Separate Enrolled/Resp Rate Column
sep = df['Enrolled/Resp Rate'].str.extract(r'(\d+)\((\d+\.\d+)%\)')
sep = sep.fillna(-1)
df['enrolled'] = sep[0].astype(int)
df['resp_rate'] = sep[1].astype(float)
df['resp_rate'] *= 0.01

# Drop irrelevant columns
df = df.drop(['level_0', 'level_1', 'Enrolled/Resp Rate'], axis = 1)

# Column Names
new_columns = ['instructor', 'course', 'term', 'avg_grade', 'avg_hours', 'student_learning_rating', 'course_structure_rating', 'class_environment_rating', 'num_enrolled', 'resp_rate']
df.columns = new_columns

# Clean course column
df['course'] = df['course'].str.replace(r"\ \(\d{3}\)", "", regex=True)

# Clean avg_grade columns
df['avg_grade'] = df['avg_grade'].str.extract(r"(\d\.\d+)")
df['avg_grade'] = df['avg_grade'].astype('float')


df.head()


Unnamed: 0,instructor,course,term,avg_grade,avg_hours,student_learning_rating,course_structure_rating,class_environment_rating,num_enrolled,resp_rate
0,"Dewaard, Andrew Michael",COMM 196B - Honors Seminar II: Research (A00),WI24,4.0,9.0,5.0,4.94,4.95,8,0.625
1,"Mignozzetti, Umberto",CSS 1 - Prog Computational Social Sci (A00),WI24,3.68,4.43,4.43,4.44,4.47,109,0.7798
2,"Trott, Sean Thomas",CSS 1 - Prog Computational Social Sci (A00),FA23,3.72,6.22,4.41,4.31,4.55,118,0.3814
3,"Mignozzetti, Umberto",CSS 1 - Prog Computational Social Sci (B00),FA23,3.64,5.5,4.4,4.38,4.5,66,0.7576
4,"Trott, Sean Thomas",CSS 2 - Data/Model Programming: CSS (A00),WI24,3.82,4.12,4.67,4.67,4.75,77,0.3377


In [84]:
df.dtypes

instructor                   object
course                       object
term                         object
avg_grade                   float64
avg_hours                    object
student_learning_rating     float64
course_structure_rating     float64
class_environment_rating    float64
num_enrolled                  int64
resp_rate                   float64
dtype: object

In [85]:
df['course'].value_counts()

course
CAT 125 - Public Rhetoric & Prac Comm             34
ECE 148 - Intro to Autonomous Vehicles (A)        19
EDS 31 - Introductn to Teaching Science (A)       16
CSE 191 - Semnr/Computer Sci & Engineer (B)       15
CSE 191 - Semnr/Computer Sci & Engineer (A)       15
                                                  ..
HIEA 114 - Postwar Japan (A00)                     1
HIEA 180 - Topics/Modern Korean History (A00)      1
GLBH 113 - Women's Health/Global Persp (A00)       1
GLBH 114 - Latin American Health&Healing (A00)     1
CSS 100 - Advanced Programming: CSS (A00)          1
Name: count, Length: 657, dtype: int64

In [86]:
df.to_csv("checkpoint_3.csv")