In [58]:
from bs4 import BeautifulSoup

# Load the HTML file
with open('/Users/matthew/Documents/course-recommender/cse_course_catalogue.html', 'r', encoding='utf-8') as file:
    html_content = file.read()

# Parse the HTML
soup = BeautifulSoup(html_content, 'html.parser')
    

In [59]:
# Function to extract relevant information for each class and quiz section
def extract_course_info():
    courses = []
    
    # Find all the tables containing class information (including quizzes)
    tables = soup.find_all('table')

    for table in tables:
        course_info = {}

        # Check if the table contains course information (blue header)
        course_name_tag = table.find('a', href=True)
        if course_name_tag and table.find('b'):  # Main course sections
            course_info['Course Name'] = course_name_tag.text.strip()

            # Find the next table with class details (SLN, section, times, etc.)
            next_table = table.find_next('table')
            if next_table:
                pre_tag = next_table.find('pre')
                if pre_tag:
                    text = pre_tag.get_text(separator=" ", strip=True).split()

                    # Safely parse the fields, handling index errors for missing data
                    course_info['SLN'] = text[0] if len(text) > 0 else ''
                    course_info['Section'] = text[1] if len(text) > 1 else ''
                    course_info['Credits'] = text[2] if len(text) > 2 else ''
                    course_info['Meeting Times'] = ' '.join(text[3:6]) if len(text) > 5 else ''
                    course_info['Building/Room'] = text[6] + ' ' + text[7] if len(text) > 7 else ''
                    
                    # Fix: Parse Instructor, Status, and Enrollment correctly
                    course_info['Instructor'] = text[8] if len(text) > 8 else ''
                    course_info['Status'] = text[9] if len(text) > 9 else ''
                    course_info['Enrollment'] = ' '.join(text[10:12]) if len(text) > 11 else ''

                    # Check for and remove any leftover additional info
                    additional_info_start = next_table.get_text().find('SEE')
                    if additional_info_start != -1:
                        additional_info_end = next_table.get_text().find('--')
                        course_info['Additional Info'] = next_table.get_text()[additional_info_start:additional_info_end].strip()

            courses.append(course_info)

        # Now handle quiz sections (QZ sections typically following course sections)
        elif table.find('pre') and "QZ" in table.get_text():
            pre_tag = table.find('pre')
            if pre_tag:
                text = pre_tag.get_text(separator=" ", strip=True).split()

                # Correctly map the quiz section fields
                course_info['SLN'] = text[0] if len(text) > 0 else ''
                course_info['Section'] = text[1] + " (Quiz)" if len(text) > 1 else ''
                course_info['Meeting Times'] = ' '.join(text[2:5]) if len(text) > 5 else ''
                course_info['Building/Room'] = text[5] + ' ' + text[6] if len(text) > 6 else ''
                course_info['Status'] = text[7] if len(text) > 7 else ''
                course_info['Enrollment'] = ' '.join(text[8:10]) if len(text) > 9 else ''

                courses.append(course_info)

    return courses

# Extract the course and quiz information and print
course_list = extract_course_info()

for course in course_list:
    print(course)


{'Course Name': 'Enrollment Summary'}
{'Course Name': 'Enrollment Summary', 'SLN': 'Enrl', 'Section': 'Sect', 'Credits': 'Crs', 'Meeting Times': 'Restr SLN ID', 'Building/Room': 'Cred Meeting', 'Instructor': 'Times', 'Status': 'Bldg/Rm', 'Enrollment': 'Instructor Status'}
{'Course Name': '?'}
{'Course Name': 'COMP PROGRAMMING I', 'SLN': '12810', 'Section': 'A', 'Credits': '4', 'Meeting Times': 'WF 1230-120 KNE', 'Building/Room': '120 Wang,Matt', 'Instructor': 'Open', 'Status': '0/', 'Enrollment': '302 --', 'Additional Info': ''}
{'SLN': '12811', 'Section': 'AA (Quiz)', 'Meeting Times': 'QZ TTh 830-920', 'Building/Room': 'SAV 158', 'Status': 'Open', 'Enrollment': '0/ 21'}
{'SLN': '12812', 'Section': 'AB (Quiz)', 'Meeting Times': 'QZ TTh 930-1020', 'Building/Room': 'CDH 115', 'Status': 'Open', 'Enrollment': '0/ 21'}
{'SLN': '12813', 'Section': 'AC (Quiz)', 'Meeting Times': 'QZ TTh 1030-1120', 'Building/Room': 'LOW 201', 'Status': 'Open', 'Enrollment': '0/ 21'}
{'SLN': '12814', 'Section':

In [60]:
import json

# Save the course data as a JSON file
with open('course_catalog.json', 'w', encoding='utf-8') as json_file:
    json.dump(course_list, json_file, indent=4)