In [99]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import json
import time
import csv

In [104]:
cookies_raw = json.load(open('academicaffairs.ucsd.edu.cookies.json', 'r'))

In [105]:
cookies = {cookie['name']: cookie['value'] for cookie in cookies_raw}

In [109]:
driver = webdriver.Chrome()

# Load the webpage where you want to use the cookies
driver.get("http://academicaffairs.ucsd.edu/")

# Load cookies from the JSON file
with open('academicaffairs.ucsd.edu.cookies.json', 'r') as file:
    cookies_raw = json.load(file)

# Add each cookie to the Selenium
for cookie in cookies_raw:
    driver.add_cookie(cookie)

# Refresh the page to apply the cookies
driver.refresh()

# Get SET Page
driver.get("http://academicaffairs.ucsd.edu/Modules/Evals/SET/Reports/Search.aspx")

time.sleep(5)

# Find the unit dropdown and collect all unit options
unit_dropdown = driver.find_element(By.ID, "ContentPlaceHolder1_EvalsContentPlaceHolder_ddlUnit")
unit_options = unit_dropdown.find_elements(By.TAG_NAME, "option")
units = [(option.text, option.get_attribute("value")) for option in unit_options if option.get_attribute("value")]
with open('scraped_data.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Instructor', 'Course', 'Term', 'Enrolled/Resp Rate', 'Avg Grade Received', 'Avg Hours Worked', 'Student Learning', 'Course Structure', 'Class Environment'])
    
    for unit in units:
        # Select a unit
        unit_dropdown = driver.find_element(By.ID, "ContentPlaceHolder1_EvalsContentPlaceHolder_ddlUnit")
        for option in unit_dropdown.find_elements(By.TAG_NAME, "option"):
            if option.get_attribute("value") == unit[1]:
                option.click()
                break
    
        # Wait for the course dropdown to be populated
        time.sleep(2)
        course_dropdown = driver.find_element(By.ID, "ContentPlaceHolder1_EvalsContentPlaceHolder_ddlCourse")
        course_options = course_dropdown.find_elements(By.TAG_NAME, "option")
    
        # Extract course information
        courses = [(option.text, option.get_attribute("value")) for option in course_options if option.get_attribute("value")]
    
        for course in courses:
            # Select a course
            course_dropdown = driver.find_element(By.ID, "ContentPlaceHolder1_EvalsContentPlaceHolder_ddlCourse")
            for option in course_dropdown.find_elements(By.TAG_NAME, "option"):
                if option.get_attribute("value") == course[1]:
                    option.click()
                    break
    
            # Click the search button
            search_button = driver.find_element(By.ID, "ContentPlaceHolder1_EvalsContentPlaceHolder_btnSubmit")
            search_button.click()
    
            # Wait for the table to load
            time.sleep(5)
    
            # Scrape the table data using BeautifulSoup
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            # Scrape the SET Teaching Results
            table = soup.find("table")
        
            # Extract table rows
            rows = table.find('tbody').find_all('tr')
            for row in rows:
                cells = row.find_all('td')
                row_data = [cell.text.strip() for cell in cells]
                writer.writerow([unit[0], course[0]] + row_data)
            

# Close the browser
driver.quit()


AttributeError: 'NoneType' object has no attribute 'find'

In [1]:
import pandas as pd

In [15]:
scraped_data = pd.read_csv('scraped_data.csv')

## Cleaning Data &#x1F6AE;
After scraping the data, there are some things to clean.
#### Todo List
- Fix types
- Column Names
    - Implement naming conventions.
- Avg Grade Received Column
    - Remove trailing white space and commas
- Course Column and Indexes
    - Remove redundancy
    - Group by instructor, course and term
- Enrolled/Resp Rate
    - Separate Columns
- Term Column
    - Make sure terms start from Summer 2023 (when SET was introduced)

In [17]:
scraped_data.head()

Unnamed: 0,Unnamed: 1,Instructor,Course,Term,Enrolled/Resp Rate,Avg Grade Received,Avg Hours Worked,Student Learning,Course Structure,Class Environment
Analytical Writing Program,AWP 3 - Analytical Writing,"Given, William Allan",AWP 3 - Analytical Writing (002),SP24,10(70.00%),2.37 ...,7.29,4.36,4.43,4.54
Analytical Writing Program,AWP 3 - Analytical Writing,"Kolodezh, Samuel",AWP 3 - Analytical Writing (009),SP24,7(42.86%),1.86 ...,8.33,4.67,4.58,4.42
Analytical Writing Program,AWP 3 - Analytical Writing,"Gilbert, Peter W",AWP 3 - Analytical Writing (010),SP24,7(57.14%),1.81 ...,8.0,5.0,5.0,4.88
Analytical Writing Program,AWP 3 - Analytical Writing,"Gilbert, Peter W",AWP 3 - Analytical Writing (003),WI24,7(71.43%),2.77 ...,8.6,4.2,3.9,4.3
Analytical Writing Program,AWP 3 - Analytical Writing,"Gilbert, Peter W",AWP 3 - Analytical Writing (004),WI24,8(100.00%),3.13 ...,6.25,4.41,4.47,4.47


In [19]:
scraped_data.columns

Index(['Instructor', 'Course', 'Term', 'Enrolled/Resp Rate',
       'Avg Grade Received', 'Avg Hours Worked', 'Student Learning',
       'Course Structure', 'Class Environment'],
      dtype='object')

In [93]:
scraped_data.dtypes

Instructor             object
Course                 object
Term                   object
Enrolled/Resp Rate     object
Avg Grade Received     object
Avg Hours Worked      float64
Student Learning      float64
Course Structure      float64
Class Environment     float64
dtype: object

In [89]:
df['avg_grade']

0      2.37                                          ...
1      1.86                                          ...
2      1.81                                          ...
3      2.77                                          ...
4      3.13                                          ...
                             ...                        
125                                                (N/A)
126                                                (N/A)
127                                                (N/A)
128                                                (N/A)
129                                                (N/A)
Name: avg_grade, Length: 130, dtype: object

In [53]:
import re

In [98]:
df = scraped_data.reset_index()

# Separate Enrolled/Resp Rate Column
sep = df['Enrolled/Resp Rate'].str.extract(r'(\d+)\((\d+\.\d+)%\)')
df['enrolled'] = sep[0].astype(int)
df['resp_rate'] = sep[1].astype(float)
df['resp_rate'] *= 0.01

# Drop irrelevant columns
df = df.drop(['level_0', 'level_1', 'Enrolled/Resp Rate'], axis = 1)

# Column Names
new_columns = ['instructor', 'course', 'term', 'avg_grade', 'avg_hours', 'student_learning_rating', 'course_structure_rating', 'class_environment_rating', 'num_enrolled', 'resp_rate']
df.columns = new_columns

# Clean course column
df['course'] = df['course'].str.replace(r"\ \(\d{3}\)", "", regex=True)

# Clean avg_grade columns
df['avg_grade'] = df['avg_grade'].str.extract(r"(\d\.\d+)")
df['avg_grade'] = df['avg_grade'].astype('float')


df.head()


Unnamed: 0,instructor,course,term,avg_grade,avg_hours,student_learning_rating,course_structure_rating,class_environment_rating,num_enrolled,resp_rate
0,"Given, William Allan",AWP 3 - Analytical Writing,SP24,2.37,7.29,4.36,4.43,4.54,10,0.7
1,"Kolodezh, Samuel",AWP 3 - Analytical Writing,SP24,1.86,8.33,4.67,4.58,4.42,7,0.4286
2,"Gilbert, Peter W",AWP 3 - Analytical Writing,SP24,1.81,8.0,5.0,5.0,4.88,7,0.5714
3,"Gilbert, Peter W",AWP 3 - Analytical Writing,WI24,2.77,8.6,4.2,3.9,4.3,7,0.7143
4,"Gilbert, Peter W",AWP 3 - Analytical Writing,WI24,3.13,6.25,4.41,4.47,4.47,8,1.0


In [97]:
df.dtypes

instructor                   object
course                       object
term                         object
avg_grade                   float64
avg_hours                   float64
student_learning_rating     float64
course_structure_rating     float64
class_environment_rating    float64
num_enrolled                  int64
resp_rate                   float64
dtype: object

In [108]:
df['course'].value_counts()

course
AWP 3 - Analytical Writing       89
AWP 4A - Analytical Writing A    41
Name: count, dtype: int64