In [1]:
# Cell 1: Imports & Setup
import re
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from tqdm import tqdm
import google.generativeai as genai
import time
import os
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Cell 2: Configure API & Constants
# Load environment variables from .env file
load_dotenv()

# The Gemini client cannot be configured without a key, so stop right away
# when the variable is unset or empty.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if GEMINI_API_KEY is None or GEMINI_API_KEY == "":
    raise ValueError("GEMINI_API_KEY not found in .env file")

genai.configure(api_key=GEMINI_API_KEY)

# Site root (used to resolve relative links) and the page that lists the
# undergraduate programs.
BASE_URL = "https://catalog.unc.edu"
PROGRAMS_URL = BASE_URL + "/undergraduate/programs-study/"

In [None]:
# Cell 3: RequirementsParser Class
class RequirementsParser:
    """Turn a program page's requirements HTML into a dict by asking Gemini.

    Attributes:
        model: The ``genai.GenerativeModel`` used for generation.
        delay: Minimum number of seconds kept between two API calls.
        api_calls: Number of generation requests issued so far.
        failed_parses: ``(program_name, error_message)`` tuples, one for each
            failed call that was given a program name.
        last_call_time: ``time.time()`` of the most recent API call (0 before
            the first call).
    """

    def __init__(self, model="gemini-2.0-flash-lite", delay: float = 0.5):
        """Initialize parser with Gemini API.

        Args:
            model: Model name handed to ``genai.GenerativeModel``.
            delay: Minimum spacing, in seconds, enforced between API calls.
        """
        self.model = genai.GenerativeModel(model)
        self.delay = delay
        self.api_calls = 0
        self.failed_parses = []
        self.last_call_time = 0

    def _throttle(self):
        """Sleep just long enough to keep ``self.delay`` seconds between calls."""
        elapsed = time.time() - self.last_call_time
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)
        self.last_call_time = time.time()

    @staticmethod
    def _build_prompt(requirements_html: str) -> str:
        """Return the instruction text sent to the model for one HTML fragment."""
        return f"""Parse the following major/minor requirements HTML and return a JSON object with this structure:

{{
    "program_type": "...",
    "degree_type": "...",
    "total_hours": ...,
    "requirements": {{ ... }},
    "footnotes": [...],
    "special_notes": "..."
}}

HTML content:
{requirements_html}

Return ONLY the JSON object."""

    @staticmethod
    def _decode_response(raw_text: str) -> dict:
        """Decode the model's reply into a dict, tolerating Markdown code fences.

        Raises:
            json.JSONDecodeError: If the reply is not valid JSON.
            ValueError: If the reply is valid JSON but not a JSON object.
        """
        json_text = raw_text.strip()
        # The opening fence may be ```json, ```JSON or a bare ```; accept any
        # (or no) language tag instead of only the literal "json".
        json_text = re.sub(r'^```[A-Za-z0-9_-]*\s*', '', json_text)
        json_text = re.sub(r'\s*```$', '', json_text)
        parsed = json.loads(json_text)
        # Callers treat the result as a dict (they call .update on it), so a
        # list or scalar reply is reported as a parse failure here.
        if not isinstance(parsed, dict):
            raise ValueError(
                f"Expected a JSON object, got {type(parsed).__name__}"
            )
        return parsed

    def parse_requirements(self, html_content: str, program_name: str = None) -> dict:
        """Parse requirements from HTML using Gemini API.

        The requirements are taken from the ``div`` with id
        ``requirementstextcontainer``, falling back to the ``div`` with id
        ``right-col``. Only that fragment is sent to the model.

        Args:
            html_content: Full HTML of a program page.
            program_name: Optional label; when given, failures are recorded in
                ``self.failed_parses`` under this name.

        Returns:
            The decoded JSON object on success. When neither ``div`` exists,
            ``{"error": ..., "requirements": {}}`` is returned without calling
            the API. When the API call or decoding fails, a dict with
            ``error``, an empty ``requirements`` and ``program_name`` is
            returned; no exception escapes.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        requirements_div = (
            soup.find('div', {'id': 'requirementstextcontainer'})
            or soup.find('div', {'id': 'right-col'})
        )
        if not requirements_div:
            return {"error": "No requirements content found", "requirements": {}}

        requirements_html = str(requirements_div)

        # Rate limiting; the call is counted even if it later fails.
        self._throttle()
        self.api_calls += 1

        prompt = self._build_prompt(requirements_html)
        try:
            response = self.model.generate_content(prompt)
            return self._decode_response(response.text)
        except Exception as e:
            # Best effort: one bad page must not abort a batch scrape, so the
            # failure is recorded and reported in the returned dict instead.
            if program_name:
                self.failed_parses.append((program_name, str(e)))
            return {
                "error": f"Failed to parse: {str(e)}",
                "requirements": {},
                "program_name": program_name
            }

In [4]:
# Cell 4: Scraping Functions
def get_program_links(timeout: float = 30):
    """Get all program links from the programs page.

    Fetches ``PROGRAMS_URL``, finds the ``div`` with class ``az_sitemap`` and
    keeps every anchor whose ``href`` contains ``programs-study``.

    Args:
        timeout: Seconds ``requests`` waits on the server before giving up,
            so a stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list of dicts with the keys ``name`` (stripped link text), ``url``
        (absolute URL, with ``#requirementstext`` appended when it has no
        fragment) and ``program_id`` (last non-empty path segment of the
        link). The list is empty, with a printed warning, when the server
        answers with an HTTP error or the sitemap ``div`` is missing.
    """
    resp = requests.get(PROGRAMS_URL, timeout=timeout)
    if not resp.ok:
        # An error page has no sitemap; report the actual cause rather than
        # the misleading "div not found" warning below.
        print(f"Warning: {PROGRAMS_URL} returned HTTP {resp.status_code}")
        return []
    soup = BeautifulSoup(resp.text, 'html.parser')
    sitemap = soup.find('div', {'class': 'az_sitemap'})
    if not sitemap:
        print("Warning: Could not find az_sitemap div")
        return []
    links = []
    for a in sitemap.find_all('a', href=True):
        href, text = a['href'], a.text.strip()
        if 'programs-study' not in href:
            continue
        full = urljoin(BASE_URL, href)
        if '#' not in full:
            full += '#requirementstext'
        # Take the last non-empty path segment so the id is the same with or
        # without a trailing slash, query string or fragment on the link.
        segments = [part for part in urlparse(href).path.split('/') if part]
        links.append({
            'name': text,
            'url': full,
            'program_id': segments[-1] if segments else ''
        })
    return links

def scrape_program(url, parser: "RequirementsParser", info: dict, timeout: float = 30):
    """Scrape a single program's requirements.

    Args:
        url: Address of the program page to download.
        parser: Object whose ``parse_requirements(html, name)`` returns a dict.
        info: Link record; its ``name`` and ``program_id`` keys are read.
        timeout: Seconds ``requests`` waits on the server before giving up,
            so one stalled page cannot hang a whole batch run.

    Returns:
        The parser's dict with ``program_name``, ``program_id`` and ``url``
        set from the arguments. If downloading or parsing raises, the error
        is printed and a dict with those three keys plus ``error`` is
        returned instead; no exception escapes.
    """
    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
        result = parser.parse_requirements(resp.text, info['name'])
        # Identification fields always come from the link record, overriding
        # anything with the same keys in the parsed result.
        result.update({
            'program_name': info['name'],
            'program_id': info['program_id'],
            'url': url
        })
        return result
    except Exception as e:
        # Best effort: report the failure and let the caller carry on with
        # the next program.
        print(f"Error scraping {info['name']}: {e}")
        return {'program_name': info['name'], 'program_id': info['program_id'], 'url': url, 'error': str(e)}

def scrape_all_programs(parser: "RequirementsParser", limit=None):
    """Scrape all programs.

    Args:
        parser: Parser handed to ``scrape_program`` for every link.
        limit: ``None`` scrapes every link found; otherwise only the first
            ``limit`` links are scraped, so ``limit=0`` scrapes nothing.

    Returns:
        A list with one result dict per scraped link, in link order. Progress
        and timing are printed along the way.
    """
    print("🔍 Finding all program links...")
    links = get_program_links()
    # Compare against None explicitly: a plain truthiness test would treat
    # limit=0 as "no limit" and scrape the whole catalog.
    if limit is not None:
        links = links[:limit]
    print(f"🎯 Found {len(links)} programs\n")
    all_progs = []
    start_all = time.time()
    for i, info in enumerate(links, 1):
        print(f"📚 [{i}/{len(links)}] Scraping {info['name']}...")
        start = time.time()
        res = scrape_program(info['url'], parser, info)
        all_progs.append(res)
        # A result carrying an 'error' key is flagged but still kept.
        status = "✅" if 'error' not in res else "⚠️"
        print(f"{status} Completed in {time.time()-start:.1f}s\n")
    print(f"⏱️ Total time: {(time.time()-start_all)/60:.1f}m")
    return all_progs

def save_to_json(data, filename="unc_programs.json"):
    """Save data to JSON.

    Writes ``data`` as UTF-8, indented JSON with non-ASCII characters kept
    as-is, then prints a confirmation line.

    Args:
        data: Any JSON-serializable object.
        filename: Destination path. Missing parent directories are created,
            so results are not lost to a ``FileNotFoundError`` after a long
            scrape.
    """
    parent_dir = os.path.dirname(filename)
    # A bare file name has no parent component; makedirs('') would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"💾 Saved to {filename}")

In [5]:
# Cell 5: Analysis Utilities
def analyze_programs(programs):
    """Print a short report about a list of scraped program dicts.

    The report has the total/success/failed counts (an entry counts as failed
    when it has an ``error`` key), one line per failed entry with the first
    50 characters of its error, and how many names contain "major" or
    "minor" (case-insensitive).
    """
    failures = [entry for entry in programs if 'error' in entry]
    total = len(programs)
    failed = len(failures)
    success = total - failed
    print(f"\n📊 Summary: total={total}, success={success}, failed={failed}")

    if failed:
        print("\n⚠️  Failed parses:")
        for entry in failures:
            print(f" - {entry['program_name']}: {entry['error'][:50]}...")

    # Classification is by substring of the program name only.
    lowered_names = [entry.get('program_name', '').lower() for entry in programs]
    major_count = sum('major' in name for name in lowered_names)
    minor_count = sum('minor' in name for name in lowered_names)
    other_count = total - major_count - minor_count
    print(f"\n📈 Types: majors={major_count}, minors={minor_count}, other={other_count}")

def find_program(programs, term):
    """Print every program whose name contains ``term``, ignoring case.

    A header line with the match count and the lower-cased term is printed
    first, followed by one line per match with its name and URL.
    """
    needle = term.lower()
    hits = []
    for prog in programs:
        if needle in prog['program_name'].lower():
            hits.append(prog)

    print(f"\n🔍 Found {len(hits)} programs matching '{needle}':")
    for prog in hits:
        print(f"📋 {prog['program_name']} - URL: {prog['url']}")

In [None]:
# Cell 6: Main Execution
# One parser is shared by the whole run so its call counter and failure list
# cover every program. delay=2.1 keeps at least 2.1 s between Gemini calls
# (presumably to stay under an API rate limit — confirm the actual quota).
parser = RequirementsParser(model="gemini-2.0-flash-lite", delay=2.1)

# Smoke test: scrape only the first 5 program links and save them to a
# separate file before committing to a full run.
# NOTE(review): the recorded output of this cell reports
# "Saved to unc_programs_test.json", which does not match the path used
# below — the stored output appears to predate this path; re-run to refresh.
print("🧪 Testing with first 5 programs...\n")
test_progs = scrape_all_programs(parser, limit=5)
save_to_json(test_progs, "../output/unc_programs_test.json")
analyze_programs(test_progs)

# Full run over every program link, left disabled on purpose.
# Uncomment to scrape all programs
# print("🚀 Scraping all programs...\n")
# all_progs = scrape_all_programs(parser)
# save_to_json(all_progs, "../output/unc_programs.json")
# analyze_programs(all_progs)

# API usage stats: total Gemini calls made through this parser, plus at most
# the first five recorded failures with their messages cut to 50 characters.
print(f"\n📊 API calls: {parser.api_calls}, Failed parses: {len(parser.failed_parses)}")
if parser.failed_parses:
    for name, err in parser.failed_parses[:5]:
        print(f" - {name}: {err[:50]}...")

🧪 Testing with first 5 programs...

🔍 Finding all program links...
🎯 Found 5 programs

📚 [1/5] Scraping Aerospace Studies Minor...
✅ Completed in 2.7s

📚 [2/5] Scraping African American and Diaspora Studies Minor...
✅ Completed in 3.9s

📚 [3/5] Scraping African Studies Minor...
✅ Completed in 3.2s

📚 [4/5] Scraping African, African American, and Diaspora Studies Major, B.A....
✅ Completed in 2.2s

📚 [5/5] Scraping American Indian and Indigenous Studies Minor...
✅ Completed in 3.8s

⏱️ Total time: 0.3m
💾 Saved to unc_programs_test.json

📊 Summary: total=5, success=5, failed=0

📈 Types: majors=1, minors=4, other=0

📊 API calls: 5, Failed parses: 0
