In [71]:
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin
from tqdm import tqdm
import re

In [72]:
BASE_URL = "https://catalog.unc.edu"
COURSE_INDEX_URL = f"{BASE_URL}/courses/#text"

def get_department_links(only=None):
    """Scrape all department links from the main courses page."""
    response = requests.get(COURSE_INDEX_URL)
    soup = BeautifulSoup(response.text, "html.parser")
    index_div = soup.find("div", {"id": "atozindex"})
    links = []

    for a in index_div.find_all("a", href=True):
        dept_code = a['href'].split("/")[-2].upper()
        if only is None or dept_code in only:
            links.append((dept_code, urljoin(BASE_URL, a['href'])))

    return links

In [73]:
# Matches course codes like COMP 110, MATH 231, BIOL 101L, 129P, etc.
COURSE_RE = re.compile(r'\b[A-Z]{2,5}\s?\d{2,3}[A-Z]?\d?[A-Z]?\b')

def parse_requisites(raw: str):
    """
    Parse a raw 'Requisites:' string into structured JSON.
    
    Returns:
        dict with:
        - prerequisites: list of AND-clauses (each an OR-list of courses)
        - corequisites: list of AND-clauses (each an OR-list of courses)
        - grade_requirements: dict mapping course to required grade
        - requisites_note: additional notes (permission, etc.)
    """
    result = {
        "prerequisites": [],
        "corequisites": [],
        "grade_requirements": {},
        "requisites_note": None
    }
    
    if not raw:
        return result
    
    # Clean up the raw text
    text = raw.replace("Requisites:", "").strip()
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    
    # Extract grade requirements first
    grade_pattern = r'[A-Z][-+]?\s+or\s+better'
    grade_matches = re.finditer(f'({grade_pattern})', text, re.IGNORECASE)
    
    for match in grade_matches:
        grade_text = match.group(0)
        grade = grade_text.split()[0]  # Extract just the grade (e.g., "C", "C+")
        
        # Look for courses mentioned before this grade requirement
        prefix = text[:match.start()]
        # Find courses in the last clause before the grade
        last_clause_match = re.search(r'(?:^|[;.])\s*([^;.]*?)$', prefix)
        if last_clause_match:
            clause_text = last_clause_match.group(1)
            courses = COURSE_RE.findall(clause_text)
            for course in courses:
                result["grade_requirements"][course.replace(' ', '')] = grade
    
    # Now parse the structure
    # Split by major separators (semicolons and periods)
    clauses = re.split(r'[;.]', text)
    
    notes = []
    
    for clause in clauses:
        clause = clause.strip()
        if not clause:
            continue
            
        # Identify the type of requisite
        is_prereq = False
        is_coreq = False
        is_pre_or_co = False
        
        if re.search(r'\bPre-?\s*or\s+co-?requisites?\b', clause, re.IGNORECASE):
            is_pre_or_co = True
        elif re.search(r'\bPrerequisites?\b', clause, re.IGNORECASE):
            is_prereq = True
        elif re.search(r'\bCo-?requisites?\b', clause, re.IGNORECASE):
            is_coreq = True
        else:
            # If no explicit label, check context
            if any(keyword in clause.lower() for keyword in ['permission', 'instructor', 'seniors', 'graduate']):
                notes.append(clause)
                continue
            # Default to prerequisite if courses are mentioned
            elif COURSE_RE.search(clause):
                is_prereq = True
        
        # Extract courses from this clause
        and_groups = extract_course_groups(clause)
        
        if and_groups:
            if is_pre_or_co:
                result["prerequisites"].extend(and_groups)
                result["corequisites"].extend(and_groups)
            elif is_prereq:
                result["prerequisites"].extend(and_groups)
            elif is_coreq:
                result["corequisites"].extend(and_groups)
        
        # Check for permission/other notes
        if re.search(r'\bpermission\b|\binstructor\b|\bseniors\b|\bgraduate\b', clause, re.IGNORECASE):
            # Remove course codes to get just the note
            note_text = clause
            for course in COURSE_RE.findall(clause):
                note_text = note_text.replace(course, '')
            note_text = re.sub(r'\s+', ' ', note_text).strip()
            if note_text and not note_text.isspace():
                notes.append(note_text)
    
    # Clean up and combine notes
    if notes:
        # Remove duplicate notes and clean them up
        clean_notes = []
        for note in notes:
            # Remove requisite labels from notes
            note = re.sub(r'\b(?:Pre-?\s*or\s+co-?requisites?|Prerequisites?|Co-?requisites?)\b[:,]?\s*', '', note, flags=re.IGNORECASE)
            # Remove grade requirements that are already captured
            note = re.sub(f'{grade_pattern}(?:\s+in\s+(?:all|both)\s+prerequisite\s+courses?)?', '', note, flags=re.IGNORECASE)
            note = re.sub(r'[;,]\s*(?:and\s+)?a?\s+grade\s+of\s+', '', note, flags=re.IGNORECASE)
            note = re.sub(r'a\s+grade\s+of\s+(?:is\s+required)?', '', note, flags=re.IGNORECASE)
            note = note.strip(' ;,.')
            if note and not note.isspace():
                clean_notes.append(note)
        
        # Deduplicate and join
        clean_notes = list(dict.fromkeys(clean_notes))
        if clean_notes:
            result["requisites_note"] = "; ".join(clean_notes)
    
    return result


def extract_course_groups(text):
    """
    Extract AND-groups of OR-lists of courses from a text clause.
    Returns list of lists, where each inner list is an OR group.
    """
    groups = []
    
    # Special case: "one of the following" means all courses form one OR group
    if "one of the following" in text.lower():
        courses = COURSE_RE.findall(text)
        if courses:
            # Normalize course codes (ensure space between dept and number)
            normalized = []
            for course in courses:
                if not ' ' in course:
                    # Insert space between letters and numbers
                    course = re.sub(r'([A-Z]+)(\d)', r'\1 \2', course)
                normalized.append(course)
            groups.append(normalized)
        return groups
    
    # Handle parenthetical groups first
    paren_pattern = r'\(([^)]+)\)'
    paren_matches = list(re.finditer(paren_pattern, text))
    
    # Process parenthetical groups
    for match in paren_matches:
        paren_content = match.group(1)
        if ' or ' in paren_content.lower():
            courses = COURSE_RE.findall(paren_content)
            if courses:
                normalized = []
                for course in courses:
                    if not ' ' in course:
                        course = re.sub(r'([A-Z]+)(\d)', r'\1 \2', course)
                    normalized.append(course)
                groups.append(normalized)
    
    # Remove parenthetical content for further processing
    text_no_parens = re.sub(paren_pattern, '', text)
    
    # Split by "and" for AND-groups
    and_parts = re.split(r'\s+and\s+', text_no_parens, flags=re.IGNORECASE)
    
    for part in and_parts:
        # Skip if this part was already handled as parenthetical
        if not COURSE_RE.search(part):
            continue
            
        # Check for OR relationships
        if ' or ' in part.lower():
            or_parts = re.split(r'\s+or\s+', part, flags=re.IGNORECASE)
            or_courses = []
            for or_part in or_parts:
                courses = COURSE_RE.findall(or_part)
                for course in courses:
                    if not ' ' in course:
                        course = re.sub(r'([A-Z]+)(\d)', r'\1 \2', course)
                    or_courses.append(course)
            if or_courses:
                groups.append(or_courses)
        else:
            # Single course or multiple courses that are all required
            courses = COURSE_RE.findall(part)
            for course in courses:
                if not ' ' in course:
                    course = re.sub(r'([A-Z]+)(\d)', r'\1 \2', course)
                groups.append([course])  # Single course = OR group of one
    
    return groups

  note = re.sub(f'{grade_pattern}(?:\s+in\s+(?:all|both)\s+prerequisite\s+courses?)?', '', note, flags=re.IGNORECASE)


In [74]:
def parse_course_block(block):
    """
    Parse a course block from the HTML, using the improved requisite parser.
    """
    data = {
        "department": None,
        "course_number": None,
        "course_name": None,
        "credits": None,
        "description": None,
        "requisites": {"prerequisites": [], "corequisites": []},
        "grade_requirements": {},
        "requisites_note": None,
        "gen_ed": None,
        "grading_status": None
    }

    # Header line
    header = block.find("div", class_="cols noindent")
    if header:
        strong_tags = header.find_all("strong")
        if len(strong_tags) >= 3:
            code = strong_tags[0].text.strip()
            if " " in code:
                data["department"], data["course_number"] = code.split(" ", 1)
                data["course_number"] = data["course_number"].rstrip(".")
            data["course_id"] = f"{data['department']} {data['course_number']}"
            data["course_name"] = strong_tags[1].text.strip()
            data["credits"] = strong_tags[2].text.strip().replace(" Credits.", "")

    # Description
    desc_block = block.find("p", class_="courseblockextra")
    if desc_block:
        data["description"] = desc_block.text.strip()

    # Requisites
    req_span = block.find("span", class_="text detail-requisites margin--default")
    if req_span:
        req_data = parse_requisites(req_span.text)
        data["requisites"] = {
            "prerequisites": req_data["prerequisites"],
            "corequisites": req_data["corequisites"]
        }
        data["grade_requirements"] = req_data["grade_requirements"]
        data["requisites_note"] = req_data["requisites_note"]

    # Gen Ed
    idea_span = block.find("span", class_="text detail-idea_action margin--default")
    if idea_span:
        idea_text = idea_span.text.strip().replace("IDEAs in Action Gen Ed:", "")
        data["gen_ed"] = idea_text.strip()

    # Grading
    grading_span = block.find("span", class_="text detail-grading_status margin--default")
    if grading_span:
        data["grading_status"] = grading_span.text.strip().replace("Grading Status: ", "")

    return data

In [75]:
def parse_department(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    course_blocks = soup.find_all("div", class_="courseblock")
    return [parse_course_block(cb) for cb in course_blocks]

In [76]:
def scrape_all_courses(only=None):
    department_links = get_department_links(only=only)
    all_courses = {}

    for dept_code, url in tqdm(department_links, desc="Scraping departments"):
        try:
            courses = parse_department(url)
            all_courses[dept_code] = courses
        except Exception as e:
            print(f"❌ Error scraping {dept_code}: {e}")

    return all_courses

def save_to_json(data, filename="unc_courses.json"):
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

In [77]:
sample_departments = {"CHEM", "BIOL", "COMP"}
courses = scrape_all_courses(only=sample_departments)
save_to_json(courses, "unc_courses_sample.json")

Scraping departments: 100%|██████████| 3/3 [00:02<00:00,  1.35it/s]
