In [None]:
# Cell 1: Imports & Setup
import re
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import google.generativeai as genai
import time
import os
from dotenv import load_dotenv

In [8]:
# Cell 2: Configure API & Constants
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY not found in .env file")

genai.configure(api_key=GEMINI_API_KEY)

BASE_URL = "https://catalog.unc.edu"
COURSE_INDEX_URL = f"{BASE_URL}/courses/#text"

In [12]:
# Cell 3: RequisiteParser Class
class RequisiteParser:
    def __init__(self, delay: float = 0.2):
        self.model = genai.GenerativeModel('gemini-2.0-flash-lite')
        self.delay = delay
        self.api_calls = 0
        self.failed_parses = []

    def parse_requisites(self, raw: str, course_id: str = None) -> dict:
        if not raw or not raw.strip():
            return {
                "prerequisites": [],
                "corequisites": [],
                "grade_requirements": {},
                "requisites_note": None
            }
        if self.api_calls > 0:
            time.sleep(self.delay)
        self.api_calls += 1

        prompt = f"""Parse the following course requisite statement and return a JSON object with this exact structure:

{{
    "prerequisites": [ /* AND-groups of OR-alternatives */ ],
    "corequisites": [ /* same */ ],
    "grade_requirements": {{ /* course → grade */ }},
    "requisites_note": null
}}

Course Requisite:
{raw}

Return ONLY the JSON."""
        try:
            response = self.model.generate_content(prompt)
            json_text = response.text.strip()
            json_text = re.sub(r'^```json\s*', '', json_text)
            json_text = re.sub(r'\s*```$', '', json_text)
            result = json.loads(json_text)
            return self._validate_result(result)
        except Exception as e:
            if course_id:
                self.failed_parses.append((course_id, str(e)))
            return self._fallback_parse(raw)

    def _validate_result(self, result: dict) -> dict:
        validated = {
            "prerequisites": result.get("prerequisites", []),
            "corequisites": result.get("corequisites", []),
            "grade_requirements": result.get("grade_requirements", {}),
            "requisites_note": result.get("requisites_note", None)
        }
        for key in ["prerequisites", "corequisites"]:
            lst = validated[key]
            if not isinstance(lst, list):
                validated[key] = []
            else:
                cleaned = []
                for item in lst:
                    if isinstance(item, list):
                        cleaned.append(item)
                    elif isinstance(item, str):
                        cleaned.append([item])
                validated[key] = cleaned
        if not isinstance(validated["grade_requirements"], dict):
            validated["grade_requirements"] = {}
        return validated

    def _fallback_parse(self, raw: str) -> dict:
        COURSE_RE = re.compile(r'\b[A-Z]{2,5}\s?\d{2,3}[A-Z]?\d?\b')
        courses = COURSE_RE.findall(raw)
        normalized = []
        for c in courses:
            if ' ' not in c:
                c = re.sub(r'([A-Z]+)(\d)', r'\1 \2', c)
            normalized.append(c)
        prereqs = [[c] for c in normalized]
        grades = {}
        if 'C or better' in raw or 'grade of C' in raw:
            for c in normalized:
                grades[c] = 'C'
        note = None
        if 'permission' in raw.lower() or 'instructor' in raw.lower():
            note = "Permission of instructor may be required"
        return {
            "prerequisites": prereqs,
            "corequisites": [],
            "grade_requirements": grades,
            "requisites_note": note
        }

In [18]:
# Cell 4: Scraping Functions
def get_department_links(only=None):
    response = requests.get(COURSE_INDEX_URL)
    soup = BeautifulSoup(response.text, "html.parser")
    index_div = soup.find("div", {"id": "atozindex"})
    links = []
    for a in index_div.find_all("a", href=True):
        dept = a['href'].split("/")[-2].upper()
        if only is None or dept in only:
            links.append((dept, urljoin(BASE_URL, a['href'])))
    return links

def parse_course_block(block, parser: RequisiteParser):
    data = {
        "department": None,
        "course_number": None,
        "course_id": None,
        "course_name": None,
        "credits": None,
        "description": None,
        "requisites": {"prerequisites": [], "corequisites": []},
        "grade_requirements": {},
        "requisites_note": None,
        "gen_ed": None,
        "grading_status": None
    }
    header = block.find("div", class_="cols noindent")
    if header:
        strongs = header.find_all("strong")
        if len(strongs) >= 3:
            code = strongs[0].text.strip()
            if " " in code:
                dept, num = code.split(" ", 1)
                num = num.rstrip(".")
                data["department"], data["course_number"] = dept, num
                data["course_id"] = f"{dept} {num}"
            data["course_name"] = strongs[1].text.strip()
            data["credits"] = strongs[2].text.strip().replace(" Credits.", "")
    desc = block.find("p", class_="courseblockextra")
    if desc:
        data["description"] = desc.text.strip()
    req = block.find("span", class_="text detail-requisites margin--default")
    if req:
        cid = data.get("course_id", None)
        parsed = parser.parse_requisites(req.text, cid)
        data["requisites"] = {
            "prerequisites": parsed["prerequisites"],
            "corequisites": parsed["corequisites"]
        }
        data["grade_requirements"] = parsed["grade_requirements"]
        data["requisites_note"] = parsed["requisites_note"]
    idea = block.find("span", class_="text detail-idea_action margin--default")
    if idea:
        data["gen_ed"] = idea.text.strip().replace("IDEAs in Action Gen Ed:", "")
    grade = block.find("span", class_="text detail-grading_status margin--default")
    if grade:
        data["grading_status"] = grade.text.strip().replace("Grading Status: ", "")
    return data

def parse_department(url, parser: RequisiteParser, dept_code: str = None):
    """Parse all courses from a department page."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    course_blocks = soup.find_all("div", class_="courseblock")
    
    total_courses = len(course_blocks)
    print(f"   Found {total_courses} courses to parse")
    
    courses = []
    for i, cb in enumerate(course_blocks, 1):
        # Extract course ID for progress display
        header = cb.find("div", class_="cols noindent")
        course_id = "Unknown"
        if header:
            strong_tags = header.find_all("strong")
            if strong_tags:
                course_id = strong_tags[0].text.strip()
        
        print(f"   Processing {course_id} ({i}/{total_courses})...", end='\r')
        courses.append(parse_course_block(cb, parser))
    
    print(f"   ✓ Completed all {total_courses} courses in {dept_code}     ")
    return courses

def scrape_all_courses(parser: RequisiteParser, only=None):
    """Scrape all courses using the LLM-based parser."""
    department_links = get_department_links(only=only)
    all_courses = {}
    
    print(f"\n🎯 Starting scrape of {len(department_links)} departments\n")

    for dept_idx, (dept_code, url) in enumerate(department_links, 1):
        try:
            print(f"📚 [{dept_idx}/{len(department_links)}] Scraping {dept_code}...")
            dept_start_time = time.time()
            
            courses = parse_department(url, parser, dept_code)
            all_courses[dept_code] = courses
            
            dept_elapsed = time.time() - dept_start_time
            print(f"✅ Successfully scraped {dept_code} in {dept_elapsed/60:.1f} minutes\n")
        except Exception as e:
            print(f"❌ Error scraping {dept_code}: {e}\n")
    
    return all_courses

def save_to_json(data, filename="unc_courses.json"):
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"💾 Saved to {filename}")

In [None]:
# Cell 5: Main Execution
parser = RequisiteParser(delay=2.1)

# Overall timing
overall_start_time = time.time()

# Scrape departments
sample_departments = {"COMP"}
courses = scrape_all_courses(parser, only=sample_departments)
save_to_json(courses, "output/unc_courses.json")

overall_elapsed = time.time() - overall_start_time

# Statistics
print(f"\n📊 Statistics:")
print(f"   Total scraping time: {overall_elapsed/60:.1f} minutes")
print(f"   Total API calls: {parser.api_calls}")
print(f"   Failed parses: {len(parser.failed_parses)}")

if parser.failed_parses:
    print("\n⚠️  Failed to parse requisites for:")
    for course_id, error in parser.failed_parses[:5]:
        print(f"   - {course_id}: {error}")
    if len(parser.failed_parses) > 5:
        print(f"   ... and {len(parser.failed_parses) - 5} more")


🎯 Starting scrape of 1 departments

📚 [1/1] Scraping COMP...
   Found 109 courses to parse
   ✓ Completed all 109 courses in COMP     
✅ Successfully scraped COMP in 3.0 minutes

💾 Saved to unc_courses.json

📊 Statistics:
⏱️  Total scraping time: 3.0 minutes
   Total API calls: 79
   Failed parses: 68

⚠️  Failed to parse requisites for:
   - COMP 401: 429 You exceeded your current quota, please check ...
   - COMP 410: 429 You exceeded your current quota, please check ...
   - COMP 411: 429 You exceeded your current quota, please check ...
   - COMP 421: 429 You exceeded your current quota, please check ...
   - COMP 423: 429 You exceeded your current quota, please check ...
   ... and 63 more


In [20]:
# Cell to count non-empty requisites across all departments
def count_requisites(only=None):
    """Count how many courses have non-empty requisites across departments."""
    department_links = get_department_links(only=only)
    
    total_courses = 0
    courses_with_requisites = 0
    dept_stats = {}
    
    print(f"🔍 Analyzing {len(department_links)} departments...\n")
    
    for dept_code, url in tqdm(department_links, desc="Scanning departments"):
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        course_blocks = soup.find_all("div", class_="courseblock")
        
        dept_total = len(course_blocks)
        dept_with_reqs = 0
        
        for block in course_blocks:
            req_span = block.find("span", class_="text detail-requisites margin--default")
            if req_span and req_span.text.strip() and req_span.text.strip() != "Requisites:":
                dept_with_reqs += 1
        
        dept_stats[dept_code] = {
            "total": dept_total,
            "with_requisites": dept_with_reqs,
            "percentage": (dept_with_reqs / dept_total * 100) if dept_total > 0 else 0
        }
        
        total_courses += dept_total
        courses_with_requisites += dept_with_reqs
    
    # Print summary
    print(f"\n📊 Requisite Analysis Complete!\n")
    print(f"Total courses across all departments: {total_courses}")
    print(f"Courses with requisites: {courses_with_requisites}")
    print(f"Courses without requisites: {total_courses - courses_with_requisites}")
    print(f"Percentage with requisites: {courses_with_requisites/total_courses*100:.1f}%")
    print(f"\n💡 You will need {courses_with_requisites} API calls")
    print(f"⏱️  Estimated time at 2.1s/call: {courses_with_requisites * 2.1 / 60:.1f} minutes")
    
    # Show top departments by requisite count
    print(f"\n📈 Top 10 departments by requisite count:")
    sorted_depts = sorted(dept_stats.items(), key=lambda x: x[1]['with_requisites'], reverse=True)[:10]
    for dept, stats in sorted_depts:
        print(f"   {dept}: {stats['with_requisites']}/{stats['total']} courses ({stats['percentage']:.0f}%)")
    
    return dept_stats

# Run the analysis
# For all departments:
dept_stats = count_requisites()

# Or for specific departments:
# dept_stats = count_requisites(only={"COMP", "MATH", "BIOL", "CHEM", "PHYS"})

🔍 Analyzing 152 departments...



Scanning departments: 100%|██████████| 152/152 [01:04<00:00,  2.35it/s]


📊 Requisite Analysis Complete!

Total courses across all departments: 10212
Courses with requisites: 2759
Courses without requisites: 7453
Percentage with requisites: 27.0%

💡 You will need 2759 API calls
⏱️  Estimated time at 2.1s/call: 96.6 minutes

📈 Top 10 departments by requisite count:
   BIOL: 167/264 courses (63%)
   PSYC: 101/183 courses (55%)
   NURS: 91/169 courses (54%)
   ECON: 81/145 courses (56%)
   COMP: 79/109 courses (72%)
   PHCY: 75/112 courses (67%)
   CHEM: 74/112 courses (66%)
   SPAN: 71/119 courses (60%)
   MATH: 70/110 courses (64%)
   COMM: 69/213 courses (32%)



