In [1]:
# Cell 1: Imports & Setup
import json
import logging
import os
import re
import time
from datetime import datetime
from typing import Dict, List, Optional
from urllib.parse import unquote, urljoin, urlparse

import google.generativeai as genai
import psycopg2
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from psycopg2.extras import RealDictCursor, Json
from tqdm import tqdm

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Cell 2: Configure API & Database
# Load environment variables
# load_dotenv() is called without arguments; the two settings below are then
# read from the process environment via os.getenv.
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
DATABASE_URL   = os.getenv("DATABASE_URL")

# Fail fast: os.getenv returns None for an unset variable, and `not value`
# also rejects an empty string, so both cases stop the notebook here.
if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY not found in .env file")
if not DATABASE_URL:
    raise ValueError("DATABASE_URL not found in .env file")

# Register the API key with the google.generativeai module used by RequisiteParser.
genai.configure(api_key=GEMINI_API_KEY)

# Site root; department hrefs found on the index page are joined onto it.
BASE_URL        = "https://catalog.unc.edu"
# Course index page that get_department_links() downloads.
# NOTE(review): the "#text" part is a URL fragment — presumably irrelevant to
# the HTTP request itself; confirm before relying on it.
COURSE_INDEX_URL = f"{BASE_URL}/courses/#text"


In [3]:
# Cell 3: Database Manager class
class DatabaseManager:
    """PostgreSQL persistence layer for scraped departments, courses and requisites.

    The connection runs with ``autocommit = False``; callers decide when to
    ``commit()`` or ``rollback()``. Two in-memory caches map department codes
    and course IDs (e.g. ``"COMP 110"``) to their database primary keys.

    Each ``save_course()`` call runs inside its own SAVEPOINT. PostgreSQL
    rejects every statement after an error until the transaction (or a
    savepoint) is rolled back, so without the savepoint a single bad course
    would make all later saves in the same transaction fail.

    The class can also be used as a context manager; leaving the ``with``
    block closes the connection (after a rollback if an exception escaped).
    Nothing is committed implicitly.
    """

    # Savepoint name used to isolate each save_course() call.
    _COURSE_SAVEPOINT = "course_save"

    def __init__(self, db_url: str):
        """Connect to the database described by ``db_url`` and warm the caches.

        Args:
            db_url: URL of the form ``scheme://user:password@host:port/dbname``.
        """
        url = urlparse(db_url)

        # urlparse() does not percent-decode its components, so credentials
        # containing reserved characters (e.g. "p%40ss") must be unquoted.
        conn_params = {
            "host": url.hostname,
            "port": url.port,
            "database": unquote(url.path[1:]),
            "user": unquote(url.username) if url.username else url.username,
            "password": unquote(url.password) if url.password else url.password,
            "sslmode": "require",
            "gssencmode": "disable"
        }

        self.conn = psycopg2.connect(**conn_params)
        self.conn.autocommit = False
        self.cur = self.conn.cursor(cursor_factory=RealDictCursor)

        # Cache for lookups
        self.department_cache = {}
        self.course_id_cache = {}

        # Load existing data into cache
        self._load_cache()

    def __enter__(self):
        """Return the manager itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb):
        """Roll back if an exception escaped the block, then always close."""
        try:
            if exc_type is not None:
                self.conn.rollback()
        finally:
            self.close()
        return False

    def _load_cache(self):
        """Rebuild both caches from the current database contents."""
        self.cur.execute("SELECT id, code FROM departments")
        self.department_cache = {row['code']: row['id'] for row in self.cur.fetchall()}

        self.cur.execute("SELECT id, course_id FROM courses")
        self.course_id_cache = {row['course_id']: row['id'] for row in self.cur.fetchall()}

        logger.info(f"Loaded {len(self.department_cache)} departments and {len(self.course_id_cache)} courses into cache")

    def _lookup_course_db_id(self, course_code) -> Optional[int]:
        """Resolve a course code to its database id using the cache.

        Tries the code as given (trimmed), with all whitespace removed, and in
        ``"DEPT NUMBER"`` form split at the first digit, so department codes of
        any length are handled. Returns ``None`` for unknown or non-string codes.
        """
        if not isinstance(course_code, str):
            return None
        trimmed = course_code.strip()
        compact = re.sub(r'\s+', '', trimmed)
        candidates = [trimmed, compact]
        split_at_digit = re.match(r'^([A-Za-z]+)(\d.*)$', compact)
        if split_at_digit:
            candidates.append(f"{split_at_digit.group(1)} {split_at_digit.group(2)}")
        for candidate in candidates:
            db_id = self.course_id_cache.get(candidate)
            if db_id is not None:
                return db_id
        return None

    def get_or_create_department(self, dept_code: str) -> int:
        """Return the id of ``dept_code``, inserting the department if needed."""
        if dept_code in self.department_cache:
            return self.department_cache[dept_code]

        # The no-op DO UPDATE makes RETURNING yield the id of an existing row too.
        self.cur.execute("""
            INSERT INTO departments (code) 
            VALUES (%s) 
            ON CONFLICT (code) DO UPDATE SET code = EXCLUDED.code
            RETURNING id
        """, (dept_code,))

        dept_id = self.cur.fetchone()['id']
        self.department_cache[dept_code] = dept_id
        return dept_id

    def save_course(self, course_data: Dict) -> Optional[int]:
        """Insert or update one course together with its requisites.

        Args:
            course_data: dict with at least ``department``, ``course_id``,
                ``course_number`` and ``course_name``; optional keys are
                ``description``, ``credits``, ``gen_ed``, ``grading_status``,
                ``requisites`` and ``grade_requirements``.

        Returns:
            The database id of the saved course.

        Raises:
            Exception: whatever the failing statement raised. Before re-raising,
                the work of this call is undone via ROLLBACK TO SAVEPOINT, so
                the surrounding transaction stays usable, and cache entries
                created by this call are evicted.
        """
        course_code = course_data.get('course_id')
        dept_code = course_data.get('department')
        dept_was_cached = dept_code in self.department_cache
        course_was_cached = course_code in self.course_id_cache

        self.cur.execute(f"SAVEPOINT {self._COURSE_SAVEPOINT}")
        try:
            dept_id = self.get_or_create_department(course_data['department'])

            # gen_ed is stored as an array; wrap a single string in a list.
            gen_ed = []
            if course_data.get('gen_ed'):
                gen_ed = [course_data['gen_ed']] if isinstance(course_data['gen_ed'], str) else course_data['gen_ed']

            # Insert or update course
            self.cur.execute("""
                INSERT INTO courses 
                (course_id, department_id, course_number, name, description, 
                 credits, gen_ed, grading_status)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (course_id) DO UPDATE SET
                    name = EXCLUDED.name,
                    description = EXCLUDED.description,
                    credits = EXCLUDED.credits,
                    gen_ed = EXCLUDED.gen_ed,
                    grading_status = EXCLUDED.grading_status,
                    updated_at = NOW()
                RETURNING id
            """, (
                course_data['course_id'],
                dept_id,
                course_data['course_number'],
                course_data['course_name'],
                course_data.get('description'),
                course_data.get('credits'),
                gen_ed,
                course_data.get('grading_status')
            ))

            course_db_id = self.cur.fetchone()['id']
            # Cached before the requisites are saved so a course can resolve itself.
            self.course_id_cache[course_data['course_id']] = course_db_id

            # Save prerequisites if present
            if course_data.get('requisites'):
                self._save_prerequisites(course_db_id, course_data['requisites'])

            # Save grade requirements if present
            if course_data.get('grade_requirements'):
                self._save_grade_requirements(course_db_id, course_data['grade_requirements'])

            self.cur.execute(f"RELEASE SAVEPOINT {self._COURSE_SAVEPOINT}")
            return course_db_id

        except Exception as e:
            logger.error(f"Error saving course {course_data.get('course_id')}: {e}")
            try:
                # Undo only this course and leave the transaction usable.
                self.cur.execute(f"ROLLBACK TO SAVEPOINT {self._COURSE_SAVEPOINT}")
                self.cur.execute(f"RELEASE SAVEPOINT {self._COURSE_SAVEPOINT}")
            except Exception as undo_error:
                logger.error(f"Could not roll back savepoint {self._COURSE_SAVEPOINT}: {undo_error}")
            # Rows created inside the savepoint no longer exist; drop their ids.
            if not dept_was_cached:
                self.department_cache.pop(dept_code, None)
            if not course_was_cached:
                self.course_id_cache.pop(course_code, None)
            raise

    def _save_prerequisites(self, course_db_id: int, requisites: Dict):
        """Replace the stored prerequisites and corequisites of a course.

        ``requisites`` holds two lists of groups under ``prerequisites`` and
        ``corequisites``. The list position becomes ``prereq_group``;
        corequisite groups are offset by 1000 so the two kinds never share a
        group number. Codes that are not in the course cache are skipped.
        """
        # Clear existing prerequisites
        self.cur.execute("DELETE FROM prerequisites WHERE course_id = %s", (course_db_id,))

        self._insert_requisite_groups(course_db_id, requisites.get('prerequisites'), 0, False)
        self._insert_requisite_groups(course_db_id, requisites.get('corequisites'), 1000, True)

    def _insert_requisite_groups(self, course_db_id: int, groups, group_offset: int, is_corequisite: bool):
        """Insert one row per resolvable course code in ``groups``."""
        if isinstance(groups, str):
            groups = [groups]
        for group_idx, group in enumerate(groups or []):
            if isinstance(group, str):
                group = [group]
            elif not isinstance(group, (list, tuple)):
                continue
            for course_code in group:
                target_db_id = self._lookup_course_db_id(course_code)
                if target_db_id is None:
                    logger.debug(f"Skipping unknown requisite {course_code!r} for course id {course_db_id}")
                    continue
                self.cur.execute("""
                    INSERT INTO prerequisites 
                    (course_id, prereq_group, prereq_course_id, is_corequisite)
                    VALUES (%s, %s, %s, %s)
                    ON CONFLICT DO NOTHING
                """, (course_db_id, group_idx + group_offset, target_db_id, is_corequisite))

    def _save_grade_requirements(self, course_db_id: int, grade_requirements: Dict):
        """Replace the stored minimum-grade requirements of a course.

        ``grade_requirements`` maps a course code (with or without a space,
        e.g. ``"COMP110"`` or ``"COMP 110"``) to the minimum grade. Codes that
        are not in the course cache are skipped.
        """
        # Clear existing grade requirements
        self.cur.execute("DELETE FROM grade_requirements WHERE course_id = %s", (course_db_id,))

        for req_course_code, min_grade in grade_requirements.items():
            req_db_id = self._lookup_course_db_id(req_course_code)
            if req_db_id is None:
                logger.debug(f"Skipping grade requirement on unknown course {req_course_code!r}")
                continue
            self.cur.execute("""
                INSERT INTO grade_requirements 
                (course_id, required_course_id, minimum_grade)
                VALUES (%s, %s, %s)
                ON CONFLICT DO NOTHING
            """, (course_db_id, req_db_id, min_grade))

    def commit(self):
        """Commit the current transaction."""
        self.conn.commit()

    def rollback(self):
        """Roll back the current transaction and resynchronise the caches.

        The caches may hold ids of rows that the rollback just discarded, so
        they are reloaded from the database.
        """
        self.conn.rollback()
        self._load_cache()

    def close(self):
        """Close the cursor and the connection (the connection even if the cursor fails)."""
        try:
            self.cur.close()
        finally:
            self.conn.close()

In [4]:
# Cell 4: RequisiteParser Class
class RequisiteParser:
    """Turn a free-text requisite statement into structured data.

    The statement is sent to a Gemini model and the JSON reply is validated.
    If the call or the JSON parsing fails, a regex-based fallback extracts
    course codes instead. Calls are spaced at least ``delay`` seconds apart.

    Attributes:
        api_calls: number of model calls attempted so far.
        failed_parses: list of ``(course_id, error message)`` for every
            statement that had to use the fallback.
    """

    def __init__(self, model="gemini-1.5-flash", delay: float = 2.1):
        self.model = genai.GenerativeModel(model)
        self.delay = delay
        self.api_calls = 0
        self.failed_parses = []
        self.last_call_time = 0

    @staticmethod
    def _empty_result() -> dict:
        """Return a fresh result dict describing 'no requisites'."""
        return {"prerequisites": [], "corequisites": [], "grade_requirements": {}, "requisites_note": None}

    def parse_requisites(self, raw: str, course_id: str = None) -> dict:
        """Parse one requisite statement.

        Args:
            raw: text of the requisite statement, label included.
            course_id: id of the owning course, used only for failure records.

        Returns:
            Dict with ``prerequisites`` and ``corequisites`` (lists of lists of
            course codes), ``grade_requirements`` (dict) and ``requisites_note``.
            Empty input, or input that is only a "Requisites:" label, yields an
            empty result without calling the model.
        """
        if not raw or not raw.strip():
            return self._empty_result()
        # A span holding only the label has nothing to parse; skip the API call.
        if not re.sub(r'^\s*Requisites?\s*:', '', raw, flags=re.IGNORECASE).strip():
            return self._empty_result()

        # Simple client-side rate limit.
        elapsed = time.time() - self.last_call_time
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)
        self.last_call_time = time.time()
        self.api_calls += 1

        prompt = f"""Parse the following course requisite statement and return JSON:

    {{
      "prerequisites": [...],
      "corequisites": [...],
      "grade_requirements": {{/* course→grade */}},
      "requisites_note": null
    }}

Requisite statement:
{raw}

Return ONLY the JSON."""
        try:
            resp = self.model.generate_content(prompt)
            txt = resp.text.strip()
            # Strip a Markdown code fence, with or without a "json" language tag.
            txt = re.sub(r'^```(?:json)?\s*', '', txt, flags=re.IGNORECASE)
            txt = re.sub(r'\s*```$', '', txt)
            res = json.loads(txt)
            return self._validate_result(res)
        except Exception as e:
            # Best effort: record the failure, then fall back to regex extraction.
            self.failed_parses.append((course_id or "Unknown", str(e)))
            return self._fallback_parse(raw)

    def _validate_result(self, res: dict) -> dict:
        """Coerce a decoded model reply into the expected shape.

        Requisite lists become lists of non-empty lists of trimmed strings; a
        bare string counts as a one-course group, and anything else is dropped.
        ``grade_requirements`` is kept only if it is a dict, restricted to
        non-empty string keys with non-null values.

        Raises:
            ValueError: if ``res`` is not a dict.
        """
        if not isinstance(res, dict):
            raise ValueError(f"expected a JSON object, got {type(res).__name__}")

        out = self._empty_result()
        out["requisites_note"] = res.get("requisites_note")

        for key in ("prerequisites", "corequisites"):
            groups = res.get(key)
            if isinstance(groups, str):
                groups = [groups]
            elif not isinstance(groups, list):
                groups = []
            cleaned = []
            for item in groups:
                if isinstance(item, str):
                    item = [item]
                if not isinstance(item, list):
                    continue
                codes = [code.strip() for code in item if isinstance(code, str) and code.strip()]
                if codes:
                    cleaned.append(codes)
            out[key] = cleaned

        grades = res.get("grade_requirements", {})
        if isinstance(grades, dict):
            out["grade_requirements"] = {
                course.strip(): str(grade).strip()
                for course, grade in grades.items()
                if isinstance(course, str) and course.strip() and grade is not None
            }
        return out

    def _fallback_parse(self, raw: str) -> dict:
        """Regex-only parse used when the model reply is unusable.

        Every distinct course code found becomes its own prerequisite group.
        If the text mentions "C or better" or "grade of C", each code gets a
        minimum grade of C, keyed by the code without its space. Corequisites
        are never detected here.
        """
        code_pattern = re.compile(r'\b[A-Z]{2,5}\s?\d{2,3}[A-Z]?\d?\b')
        normalized = []
        for found in code_pattern.findall(raw):
            # Rewrite as "DEPT NUMBER" with exactly one ASCII space.
            parts = re.match(r'^([A-Z]+)\s?(.*)$', found)
            code = f"{parts.group(1)} {parts.group(2)}" if parts else found
            if code not in normalized:
                normalized.append(code)

        prereqs = [[code] for code in normalized]
        grades = {}
        if 'C or better' in raw or 'grade of C' in raw:
            for code in normalized:
                grades[code.replace(' ', '')] = 'C'
        note = None
        if 'permission' in raw.lower():
            note = "Permission of instructor may be required"
        return {"prerequisites": prereqs, "corequisites": [], "grade_requirements": grades, "requisites_note": note}

In [5]:
# Cell 5: Scraping Functions
def get_department_links(only=None):
    """Fetch the course index page and return one link per department.

    Args:
        only: optional collection of department codes (or a single code) to
            keep; compared case-insensitively. ``None`` keeps every department.

    Returns:
        List of ``(DEPT_CODE, absolute_url)`` tuples in page order. The code is
        the last non-empty path segment of the link, upper-cased.

    Raises:
        requests.RequestException: on network failure, timeout or HTTP error status.
        RuntimeError: if the page has no ``<div id="atozindex">``.
    """
    resp = requests.get(COURSE_INDEX_URL, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    idx = soup.find("div", {"id": "atozindex"})
    if idx is None:
        raise RuntimeError(f"No element with id 'atozindex' found at {COURSE_INDEX_URL}")

    if only is None:
        wanted = None
    elif isinstance(only, str):
        wanted = {only.upper()}
    else:
        wanted = {str(code).upper() for code in only}

    links = []
    for a in idx.find_all("a", href=True):
        # Take the last path segment so hrefs with or without a trailing slash
        # both work; bare anchors such as "#A" have no path and are skipped.
        segments = [part for part in urlparse(a['href']).path.split("/") if part]
        if not segments:
            continue
        code = segments[-1].upper()
        if wanted is None or code in wanted:
            links.append((code, urljoin(BASE_URL, a['href'])))
    return links

def parse_course_block(block, parser: RequisiteParser):
    """Extract one course from a ``courseblock`` element.

    Args:
        block: BeautifulSoup element for a single course.
        parser: RequisiteParser used to interpret the requisites text.

    Returns:
        Dict with keys ``department``, ``course_number``, ``course_id``,
        ``course_name``, ``credits``, ``description``, ``requisites``,
        ``grade_requirements``, ``requisites_note``, ``gen_ed`` and
        ``grading_status``. Fields that cannot be found stay ``None``, or
        empty for the requisite containers.
    """
    data = {key: None for key in ["department", "course_number", "course_id", "course_name", "credits", "description"]}
    data.update({
        "requisites": {"prerequisites": [], "corequisites": []},
        "grade_requirements": {},
        "requisites_note": None,
        "gen_ed": None,
        "grading_status": None,
    })

    header = block.find("div", class_="cols noindent")
    if header:
        sts = header.find_all("strong")
        if len(sts) >= 3:
            # split(None, 1) splits on any whitespace run, including a
            # non-breaking space between department and number.
            code_parts = sts[0].text.strip().split(None, 1)
            if len(code_parts) == 2:
                dept, number = code_parts
                number = number.strip().rstrip(".")
                data["department"], data["course_number"] = dept, number
                data["course_id"] = f"{dept} {number}"
            data["course_name"] = sts[1].text.strip()
            # Drop the trailing "Credits." / "Credit." label.
            data["credits"] = re.sub(r'\s*Credits?\.?$', '', sts[2].text.strip())

    desc = block.find("p", class_="courseblockextra")
    if desc:
        data["description"] = desc.text.strip()

    req = block.find("span", class_="text detail-requisites margin--default")
    if req:
        # "course_id" is always present as a key, so a .get() default would
        # never apply; use `or` to fall back when the value is None.
        cid = data.get("course_id") or "Unknown"
        rd = parser.parse_requisites(req.text, cid)
        data["requisites"] = {"prerequisites": rd["prerequisites"], "corequisites": rd["corequisites"]}
        data["grade_requirements"] = rd["grade_requirements"]
        data["requisites_note"] = rd["requisites_note"]

    idea = block.find("span", class_="text detail-idea_action margin--default")
    if idea:
        # Strip again after removing the label so no leading space is stored.
        data["gen_ed"] = idea.text.strip().replace("IDEAs in Action Gen Ed:", "").strip()

    grade = block.find("span", class_="text detail-grading_status margin--default")
    if grade:
        data["grading_status"] = grade.text.strip().replace("Grading Status: ", "").strip()

    return data

def parse_department(url, parser: RequisiteParser, dept_code: str, db_manager: DatabaseManager, mode: str = 'database'):
    """Download one department page and parse (and optionally save) its courses.

    Args:
        url: absolute URL of the department's course listing.
        parser: RequisiteParser handed to ``parse_course_block``.
        dept_code: department code, used for progress output only.
        db_manager: DatabaseManager to save through, or ``None`` to skip saving.
        mode: ``'database'``, ``'json'`` or ``'both'``. Courses are saved for
            ``'database'``/``'both'`` and collected for ``'json'``/``'both'``.

    Returns:
        List of parsed course dicts for ``'json'``/``'both'``; otherwise an
        empty list.

    Raises:
        requests.RequestException: on network failure, timeout or HTTP error
            status. Per-course save errors are logged and do not propagate.
    """
    response = requests.get(url, timeout=30)
    # Without this an HTTP error page would be parsed as "0 courses".
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    course_blocks = soup.find_all("div", class_="courseblock")

    total_courses = len(course_blocks)
    print(f"   Found {total_courses} courses to parse")

    courses = []
    saved_count = 0
    save_to_db = mode in ['database', 'both'] and db_manager
    collect = mode in ['json', 'both']

    for i, cb in enumerate(course_blocks, 1):
        # Read the label first so progress shows before the slower requisite parse.
        header = cb.find("div", class_="cols noindent")
        course_id = "Unknown"
        if header:
            strong_tags = header.find_all("strong")
            if strong_tags:
                course_id = strong_tags[0].text.strip()

        print(f"   Processing {course_id} ({i}/{total_courses})...", end='\r')

        course_data = parse_course_block(cb, parser)

        if save_to_db:
            if not course_data.get("course_id"):
                # A course without an id cannot be stored; skip the insert.
                logger.warning(f"Skipping save for {course_id}: no course ID could be parsed from the header")
            else:
                try:
                    db_manager.save_course(course_data)
                    saved_count += 1
                except Exception as e:
                    logger.error(f"Failed to save {course_id}: {e}")

        if collect:
            courses.append(course_data)

    print(f"   ✓ Completed all {total_courses} courses in {dept_code} (saved {saved_count} to database)     ")
    return courses

def scrape_all_courses(parser: RequisiteParser, db_manager: Optional[DatabaseManager] = None, 
                      only=None, mode='database', dry_run=False, update_existing=True):
    """
    Scrape all courses with flexible output options.

    Each department is committed separately. If a department fails, its
    database changes are rolled back and the scrape continues.

    Args:
        parser: RequisiteParser instance
        db_manager: DatabaseManager instance (required for 'database'/'both'
            unless dry_run is True)
        only: Set of department codes to scrape (None for all)
        mode: 'database', 'json', or 'both'
        dry_run: If True, nothing is written to the database
        update_existing: Accepted for compatibility but currently NOT honoured;
            existing courses are always updated. A warning is logged when
            False is passed.

    Returns:
        Dict mapping department code to its list of course dicts for
        'json'/'both'; an empty dict for 'database'.

    Raises:
        ValueError: if ``mode`` is unknown, or a database mode is requested
            without a ``db_manager`` and without ``dry_run``.
    """
    valid_modes = ('database', 'json', 'both')
    if mode not in valid_modes:
        raise ValueError(f"mode must be one of {valid_modes}, got {mode!r}")

    writes_to_db = mode in ['database', 'both'] and not dry_run
    if writes_to_db and db_manager is None:
        raise ValueError(f"db_manager is required for mode {mode!r} unless dry_run=True")
    if not update_existing:
        logger.warning("update_existing=False is not implemented; existing courses will still be updated")

    department_links = get_department_links(only=only)
    all_courses = {}

    print(f"\n🎯 Starting scrape of {len(department_links)} departments")
    print(f"   Mode: {mode}")
    print(f"   Dry run: {dry_run}")
    print(f"   Update existing: {update_existing}\n")

    overall_start_time = time.time()

    for dept_idx, (dept_code, url) in enumerate(department_links, 1):
        try:
            print(f"📚 [{dept_idx}/{len(department_links)}] Scraping {dept_code}...")
            dept_start_time = time.time()

            # Commit pending work so this department starts a fresh transaction.
            if db_manager and not dry_run:
                db_manager.commit()

            courses = parse_department(url, parser, dept_code, db_manager if not dry_run else None, mode)

            if mode in ['json', 'both']:
                all_courses[dept_code] = courses

            # Commit department transaction
            if db_manager and writes_to_db:
                db_manager.commit()

            dept_elapsed = time.time() - dept_start_time
            print(f"✅ Successfully scraped {dept_code} in {dept_elapsed/60:.1f} minutes\n")

        except Exception as e:
            print(f"❌ Error scraping {dept_code}: {e}\n")
            if db_manager and not dry_run:
                db_manager.rollback()

    overall_elapsed = time.time() - overall_start_time
    print(f"⏱️  Total scraping time: {overall_elapsed/60:.1f} minutes")

    return all_courses

def save_to_json(data, filename="unc_courses.json"):
    """Write ``data`` to ``filename`` as indented UTF-8 JSON and print a confirmation.

    Non-ASCII characters are written as-is rather than escaped.
    """
    out_file = open(filename, mode="w", encoding="utf-8")
    with out_file:
        json.dump(data, out_file, ensure_ascii=False, indent=2)
    print(f"\n💾 Saved to {filename}")

In [6]:
# Cell 6: Main Scrape & Save
# Run settings
MODE           = 'database'   # 'database', 'json', or 'both'
DRY_RUN        = False
UPDATE_EXISTING= True
sample_depts   = {"AAAD", "AERO"}

parser     = RequisiteParser(model="gemini-1.5-flash", delay=0.5)
db_manager = DatabaseManager(DATABASE_URL)

# try/finally closes the connection even if the scrape raises, so a re-run of
# this cell does not pile up open connections.
try:
    courses = scrape_all_courses(
        parser,
        db_manager,
        only=sample_depts,
        mode=MODE,
        dry_run=DRY_RUN,
        update_existing=UPDATE_EXISTING
    )

    if MODE in ['json','both']:
        save_to_json(courses,"unc_courses_sample.json")

    print(f"\n📊 API calls: {parser.api_calls}, Failed parses: {len(parser.failed_parses)}")
finally:
    db_manager.close()


2025-07-21 12:33:18,055 - INFO - Loaded 1 departments and 123 courses into cache



🎯 Starting scrape of 1 departments
   Mode: database
   Dry run: False
   Update existing: True

📚 [1/1] Scraping AERO...
   Found 13 courses to parse
   ✓ Completed all 13 courses in AERO (saved 13 to database)     
✅ Successfully scraped AERO in 1.9 minutes

⏱️  Total scraping time: 1.9 minutes

📊 API calls: 4, Failed parses: 0


In [7]:
# Cell 7: Verification / Lookup
def verify_scraping_results():
    """Print database statistics and a prerequisite count for a few sample courses.

    Uses ``CourseDatabase`` from the ``db_queries`` module (not part of this
    notebook) as a context manager. For each sample course it prints the name
    and number of prerequisites, or ``N/A`` and 0 when the lookup returns
    nothing.
    """
    from db_queries import CourseDatabase

    sample_course_ids = ("COMP 110", "COMP 211", "BIOL 101")

    with CourseDatabase() as db:
        stats = db.get_database_stats()
        print(f"\n🔍 DB Stats: {stats}")

        for cid in sample_course_ids:
            course = db.get_course(cid)
            # Prerequisites are only queried for courses that exist.
            if course:
                prereqs = db.get_course_prerequisites(cid)
                label = course['name']
            else:
                prereqs = {}
                label = 'N/A'
            prereq_total = len(prereqs.get('prerequisites', []))
            print(f"{cid}: {label} - prereqs: {prereq_total}")

verify_scraping_results()

OperationalError: connection to server at "localhost" (::1), port 5432 failed: Connection refused (0x0000274D/10061)
	Is the server running on that host and accepting TCP/IP connections?
connection to server at "localhost" (127.0.0.1), port 5432 failed: Connection refused (0x0000274D/10061)
	Is the server running on that host and accepting TCP/IP connections?


In [None]:
# Cell to count non-empty requisites across all departments
def count_requisites(only=None, seconds_per_call: float = 2.1):
    """Count how many courses have non-empty requisites across departments.

    Downloads every department page, counts course blocks whose requisites
    span holds more than the bare "Requisites:" label, and prints a summary
    with an estimate of the API calls and time a full scrape would need.

    Args:
        only: department codes to analyse (None for all).
        seconds_per_call: assumed time per API call, used for the estimate only.

    Returns:
        Dict mapping department code to
        ``{"total", "with_requisites", "percentage"}``. Departments whose page
        could not be downloaded are logged and left out.
    """
    department_links = get_department_links(only=only)

    total_courses = 0
    courses_with_requisites = 0
    dept_stats = {}

    print(f"🔍 Analyzing {len(department_links)} departments...\n")

    for dept_code, url in tqdm(department_links, desc="Scanning departments"):
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            # Skip the department rather than counting an error page as 0 courses.
            logger.warning(f"Skipping {dept_code}: {e}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        course_blocks = soup.find_all("div", class_="courseblock")

        dept_total = len(course_blocks)
        dept_with_reqs = 0

        for block in course_blocks:
            req_span = block.find("span", class_="text detail-requisites margin--default")
            req_text = req_span.text.strip() if req_span else ""
            if req_text and req_text != "Requisites:":
                dept_with_reqs += 1

        dept_stats[dept_code] = {
            "total": dept_total,
            "with_requisites": dept_with_reqs,
            "percentage": (dept_with_reqs / dept_total * 100) if dept_total > 0 else 0
        }

        total_courses += dept_total
        courses_with_requisites += dept_with_reqs

    # Guard the overall percentage: `only` may match no department at all.
    overall_percentage = (courses_with_requisites / total_courses * 100) if total_courses > 0 else 0

    # Print summary
    print(f"\n📊 Requisite Analysis Complete!\n")
    print(f"Total courses across all departments: {total_courses}")
    print(f"Courses with requisites: {courses_with_requisites}")
    print(f"Courses without requisites: {total_courses - courses_with_requisites}")
    print(f"Percentage with requisites: {overall_percentage:.1f}%")
    print(f"\n💡 You will need {courses_with_requisites} API calls")
    print(f"⏱️  Estimated time at {seconds_per_call}s/call: {courses_with_requisites * seconds_per_call / 60:.1f} minutes")

    # Show top departments by requisite count
    print(f"\n📈 Top 10 departments by requisite count:")
    sorted_depts = sorted(dept_stats.items(), key=lambda x: x[1]['with_requisites'], reverse=True)[:10]
    for dept, stats in sorted_depts:
        print(f"   {dept}: {stats['with_requisites']}/{stats['total']} courses ({stats['percentage']:.0f}%)")

    return dept_stats

# Run the analysis
# For all departments:
dept_stats = count_requisites()

# Or for specific departments:
# dept_stats = count_requisites(only={"COMP", "MATH", "BIOL", "CHEM", "PHYS"})

🔍 Analyzing 152 departments...



Scanning departments: 100%|██████████| 152/152 [01:04<00:00,  2.35it/s]


📊 Requisite Analysis Complete!

Total courses across all departments: 10212
Courses with requisites: 2759
Courses without requisites: 7453
Percentage with requisites: 27.0%

💡 You will need 2759 API calls
⏱️  Estimated time at 2.1s/call: 96.6 minutes

📈 Top 10 departments by requisite count:
   BIOL: 167/264 courses (63%)
   PSYC: 101/183 courses (55%)
   NURS: 91/169 courses (54%)
   ECON: 81/145 courses (56%)
   COMP: 79/109 courses (72%)
   PHCY: 75/112 courses (67%)
   CHEM: 74/112 courses (66%)
   SPAN: 71/119 courses (60%)
   MATH: 70/110 courses (64%)
   COMM: 69/213 courses (32%)



