In [15]:
# Cell 1: Imports & Setup
import re
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from tqdm import tqdm
import google.generativeai as genai
import time
import os
from dotenv import load_dotenv
import psycopg2
from psycopg2.extras import RealDictCursor, Json
import logging
from typing import Dict, List, Optional

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [2]:
# Cell 2: Configure API & Database
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
DATABASE_URL   = os.getenv("DATABASE_URL")

if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY not found in .env file")
if not DATABASE_URL:
    raise ValueError("DATABASE_URL not found in .env file")

genai.configure(api_key=GEMINI_API_KEY)

BASE_URL     = "https://catalog.unc.edu"
PROGRAMS_URL = f"{BASE_URL}/undergraduate/programs-study/"

In [None]:
# Cell 3: ProgramDatabaseManager Class
class ProgramDatabaseManager:
    def __init__(self, db_url: str):
        # Parse the URL
        url = urlparse(db_url)
        
        conn_params = {
            "host": url.hostname,
            "port": url.port,
            "database": url.path[1:],
            "user": url.username,
            "password": url.password,
            "sslmode": "require",
            "gssencmode": "disable"
        }
        
        self.conn = psycopg2.connect(**conn_params)
        self.conn.autocommit = False
        self.cur = self.conn.cursor(cursor_factory=RealDictCursor)
        self.course_id_cache = {}
        self._load_course_cache()
    
    def _load_course_cache(self):
        self.cur.execute("SELECT id, course_id FROM courses")
        for row in self.cur.fetchall():
            self.course_id_cache[row['course_id']] = row['id']
        logger.info(f"Loaded {len(self.course_id_cache)} courses into cache")
    
    def save_program(self, program_data: Dict) -> Optional[int]:
        try:
            self.cur.execute("""
                INSERT INTO programs 
                  (program_id, name, program_type, degree_type, total_hours, url)
                VALUES (%s,%s,%s,%s,%s,%s)
                ON CONFLICT (program_id) DO UPDATE SET
                  name=EXCLUDED.name,
                  program_type=EXCLUDED.program_type,
                  degree_type=EXCLUDED.degree_type,
                  total_hours=EXCLUDED.total_hours,
                  url=EXCLUDED.url,
                  updated_at=NOW()
                RETURNING id
            """, (
                program_data.get('program_id'),
                program_data.get('program_name'),
                program_data.get('program_type', 'major'),
                program_data.get('degree_type'),
                program_data.get('total_hours'),
                program_data.get('url')
            ))
            prog_id = self.cur.fetchone()['id']
            if program_data.get('requirements'):
                self._save_program_requirements(prog_id, program_data['requirements'])
            return prog_id
        except Exception as e:
            logger.error(f"Error saving program {program_data.get('program_name')}: {e}")
            raise
    
    def _save_program_requirements(self, program_db_id: int, requirements: Dict):
        self.cur.execute("""
            DELETE FROM program_requirement_courses 
            WHERE requirement_id IN (
              SELECT id FROM program_requirements WHERE program_id=%s
            )
        """, (program_db_id,))
        self.cur.execute("DELETE FROM program_requirements WHERE program_id=%s", (program_db_id,))
        
        display_order = 0
        mappings = [
            ('gateway_courses', 'gateway'),
            ('core_requirements','core'),
            ('electives','elective'),
            ('allied_sciences','allied_science')
        ]
        for json_key, req_type in mappings:
            items = requirements.get(json_key, [])
            for item in items:
                category    = item.get('category', json_key)
                min_credits = item.get('min_credits', item.get('total_credits'))
                min_courses = item.get('min_courses')
                select_note = item.get('selection_notes') or item.get('notes')
                level_req   = item.get('level_requirement')
                other_rest  = item.get('restrictions') or item.get('other_restrictions')
                
                self.cur.execute("""
                    INSERT INTO program_requirements
                      (program_id, requirement_type, category_name, min_credits,
                       min_courses, selection_notes, level_requirement,
                       other_restrictions, display_order)
                    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)
                    RETURNING id
                """, (
                    program_db_id, req_type, category, min_credits,
                    min_courses, select_note, level_req,
                    other_rest, display_order
                ))
                req_id = self.cur.fetchone()['id']
                display_order += 1
                self._save_requirement_courses(req_id, item)
    
    def _save_requirement_courses(self, req_id: int, item: Dict):
        courses = []
        if isinstance(item.get('courses'), list):
            for c in item['courses']:
                if isinstance(c, dict):
                    courses.append((c.get('course_code') or c.get('course'), True))
                else:
                    courses.append((c, True))
        elif item.get('course_code'):
            courses.append((item['course_code'], True))
        for code, required in courses:
            cid = self.course_id_cache.get(code)
            if cid:
                try:
                    self.cur.execute("""
                        INSERT INTO program_requirement_courses
                          (requirement_id, course_id, is_required)
                        VALUES (%s,%s,%s)
                        ON CONFLICT (requirement_id, course_id) DO NOTHING
                    """, (req_id, cid, required))
                except Exception as e:
                    logger.warning(f"Failed linking {code}: {e}")
    
    def commit(self):   self.conn.commit()
    def rollback(self): self.conn.rollback()
    def close(self):    self.cur.close(); self.conn.close()

In [10]:
# Cell 4: RequirementsParser Class
class RequirementsParser:
    def __init__(self, model="gemini-1.5-flash", delay: float = 2.1):
        self.model = genai.GenerativeModel(model)
        self.delay = delay
        self.api_calls = 0
        self.failed_parses = []
        self.last_call_time = 0

    def parse_requirements(self, html_content: str, program_name: str = None) -> dict:
        soup = BeautifulSoup(html_content, 'html.parser')
        req_div = soup.find('div', {'id':'requirementstextcontainer'}) or soup.find('div',{'id':'right-col'})
        if not req_div:
            return {"error":"No requirements content found","requirements":{}}
        html = str(req_div)
        now = time.time(); elapsed = now - self.last_call_time
        if elapsed < self.delay: time.sleep(self.delay - elapsed)
        self.last_call_time = time.time(); self.api_calls += 1

        prompt = f"""Parse the following HTML into JSON (program_type,degree_type,total_hours,requirements,footnotes,special_notes):

{html}

Return ONLY JSON."""
        try:
            resp = self.model.generate_content(prompt)
            txt  = resp.text.strip()
            txt  = re.sub(r'^```json\s*','',txt)
            txt  = re.sub(r'\s*```$','',txt)
            return json.loads(txt)
        except Exception as e:
            if program_name: self.failed_parses.append((program_name,str(e)))
            return {"error":f"Failed to parse: {e}","requirements":{},"program_name":program_name}

In [11]:
# Cell 5: Scraping Functions
def get_program_links():
    resp = requests.get(PROGRAMS_URL)
    soup = BeautifulSoup(resp.text, 'html.parser')
    sitemap = soup.find('div', {'class':'az_sitemap'})
    if not sitemap: return []
    links=[]
    for a in sitemap.find_all('a',href=True):
        href, name = a['href'], a.text.strip()
        if 'programs-study' not in href: continue
        url = urljoin(BASE_URL, href)
        if '#' not in url: url += '#requirementstext'
        links.append({'name':name,'url':url,'program_id':href.split('/')[-2]})
    return links

def scrape_program(url: str, parser: RequirementsParser, info: dict,
                   db_mgr: Optional[ProgramDatabaseManager]=None, mode: str='database'):
    try:
        resp = requests.get(url); resp.raise_for_status()
        result = parser.parse_requirements(resp.text, info['name'])
        result.update({'program_name':info['name'],'program_id':info['program_id'],'url':url})
        if mode in ['database','both'] and db_mgr and 'error' not in result:
            try: db_mgr.save_program(result)
            except Exception as e:
                logger.error(f"Save error {info['name']}: {e}")
                result['database_error']=str(e)
        return result
    except Exception as e:
        logger.error(f"Error scraping {info['name']}: {e}")
        return {'program_name':info['name'],'program_id':info['program_id'],'url':url,'error':str(e)}

def scrape_all_programs(parser: RequirementsParser, db_mgr: Optional[ProgramDatabaseManager]=None,
                       limit: Optional[int]=None, mode: str='database', dry_run: bool=False):
    print("🔍 Finding all program links...")
    prog_links = get_program_links()
    if limit: prog_links = prog_links[:limit]
    print(f"🎯 Found {len(prog_links)} programs to scrape — mode={mode}, dry_run={dry_run}\n")
    all_progs=[]; saved=0; start_all=time.time()
    for idx, info in enumerate(prog_links,1):
        print(f"📚 [{idx}/{len(prog_links)}] Scraping {info['name']}...")
        if db_mgr and not dry_run: db_mgr.conn.commit()
        start=time.time()
        res=scrape_program(info['url'],parser,info,db_mgr if not dry_run else None,mode)
        if mode in ['json','both']: all_progs.append(res)
        if db_mgr and not dry_run and mode in ['database','both'] and 'error' not in res:
            try: db_mgr.commit(); saved+=1
            except: db_mgr.rollback()
        print(f"{'✅' if 'error' not in res else '⚠️'} Completed in {time.time()-start:.1f}s\n")
    print(f"⏱️ Total time: {(time.time()-start_all)/60:.1f}m — saved {saved} programs")
    return all_progs

def save_to_json(data, filename="unc_programs.json"):
    with open(filename,"w",encoding="utf-8") as f:
        json.dump(data,f,indent=2,ensure_ascii=False)
    print(f"💾 Saved to {filename}")

In [12]:
# Cell 6: Analysis Utilities
def analyze_programs(programs):
    total = len(programs)
    success = len([p for p in programs if 'error' not in p])
    failed  = total - success
    print(f"\n📊 Summary: total={total}, success={success}, failed={failed}")
    if failed:
        print("\n⚠️ Failed programs:")
        for p in programs:
            if 'error' in p:
                print(f" - {p['program_name']}: {p['error'][:50]}...")

def find_program(programs, term):
    term = term.lower()
    matches=[p for p in programs if term in p['program_name'].lower()]
    print(f"\n🔍 Found {len(matches)} programs matching '{term}':")
    for p in matches:
        print(f" • {p['program_name']} ({p['program_type']})")

In [13]:
# Cell 7: Main Execution
parser     = RequirementsParser(model="gemini-1.5-flash", delay=2.1)
db_manager = ProgramDatabaseManager(DATABASE_URL)

MODE   = 'database'   # 'database','json', or 'both'
DRY    = False
LIMIT  = 5            # None for all programs

print("🧪 Testing with first 5 programs...\n")
programs = scrape_all_programs(parser, db_manager, limit=LIMIT, mode=MODE, dry_run=DRY)
if MODE in ['json','both']:
    save_to_json(programs, "unc_programs_test.json")
    analyze_programs(programs)

print(f"\n📊 API calls: {parser.api_calls}, Failed parses: {len(parser.failed_parses)}")
db_manager.close()

2025-07-20 23:32:25,304 - INFO - Loaded 123 courses into cache


🧪 Testing with first 5 programs...

🔍 Finding all program links...
🎯 Found 5 programs to scrape — mode=database, dry_run=False

📚 [1/5] Scraping Aerospace Studies Minor...


2025-07-20 23:32:27,356 - ERROR - Error saving program Aerospace Studies Minor: null value in column "program_type" of relation "programs" violates not-null constraint
DETAIL:  Failing row contains (1, aerospace-studies-minor, Aerospace Studies Minor, null, null, 14, https://catalog.unc.edu/undergraduate/programs-study/aerospace-s..., 2025-07-21 04:32:27.274573, 2025-07-21 04:32:27.274573).

2025-07-20 23:32:27,357 - ERROR - Save error Aerospace Studies Minor: null value in column "program_type" of relation "programs" violates not-null constraint
DETAIL:  Failing row contains (1, aerospace-studies-minor, Aerospace Studies Minor, null, null, 14, https://catalog.unc.edu/undergraduate/programs-study/aerospace-s..., 2025-07-21 04:32:27.274573, 2025-07-21 04:32:27.274573).



✅ Completed in 1.8s

📚 [2/5] Scraping African American and Diaspora Studies Minor...


2025-07-20 23:32:34,012 - ERROR - Error saving program African American and Diaspora Studies Minor: 'list' object has no attribute 'get'
2025-07-20 23:32:34,013 - ERROR - Save error African American and Diaspora Studies Minor: 'list' object has no attribute 'get'


✅ Completed in 6.7s

📚 [3/5] Scraping African Studies Minor...


2025-07-20 23:32:38,146 - ERROR - Error saving program African Studies Minor: value too long for type character varying(10)

2025-07-20 23:32:38,146 - ERROR - Save error African Studies Minor: value too long for type character varying(10)



✅ Completed in 4.1s

📚 [4/5] Scraping African, African American, and Diaspora Studies Major, B.A....


2025-07-20 23:32:40,286 - ERROR - Error saving program African, African American, and Diaspora Studies Major, B.A.: null value in column "program_type" of relation "programs" violates not-null constraint
DETAIL:  Failing row contains (3, african-african-american-diaspora-studies-major-ba, African, African American, and Diaspora Studies Major, B.A., null, null, 27, https://catalog.unc.edu/undergraduate/programs-study/african-afr..., 2025-07-21 04:32:40.241133, 2025-07-21 04:32:40.241133).

2025-07-20 23:32:40,286 - ERROR - Save error African, African American, and Diaspora Studies Major, B.A.: null value in column "program_type" of relation "programs" violates not-null constraint
DETAIL:  Failing row contains (3, african-african-american-diaspora-studies-major-ba, African, African American, and Diaspora Studies Major, B.A., null, null, 27, https://catalog.unc.edu/undergraduate/programs-study/african-afr..., 2025-07-21 04:32:40.241133, 2025-07-21 04:32:40.241133).



✅ Completed in 2.1s

📚 [5/5] Scraping American Indian and Indigenous Studies Minor...


2025-07-20 23:32:42,676 - ERROR - Error saving program American Indian and Indigenous Studies Minor: value too long for type character varying(10)

2025-07-20 23:32:42,677 - ERROR - Save error American Indian and Indigenous Studies Minor: value too long for type character varying(10)



✅ Completed in 2.4s

⏱️ Total time: 0.3m — saved 5 programs

📊 API calls: 5, Failed parses: 0


In [20]:
# Cell 8: Verification / Lookup
def verify_program_scraping():
    from db_queries import CourseDatabase
    with CourseDatabase() as db:
        stats = db.get_database_stats()
        print(f"\n🔍 Program Stats: {stats}")
        compsci = db.search_programs("computer science")
        print(f"\n📚 Found {len(compsci)} CS programs:")
        for prog in compsci:
            print(f" - {prog['name']} ({prog['program_type']})")
            reqs = db.get_program_requirements(prog['program_id'])
            print(f"   Categories: {len(reqs)}; sample: {reqs[0]['category_name'] if reqs else 'N/A'}")

verify_program_scraping()

OperationalError: could not translate host name "db.xqovabeviuvdtqjeaomo.supabase.co" to address: Name or service not known
