In [1]:
from __future__ import annotations
import re, json, time, hashlib, urllib.parse
from collections import deque
from datetime import datetime
from pathlib import Path

import requests
from bs4 import BeautifulSoup
import pandas as pd

try:
    import trafilatura
except Exception:
    trafilatura = None

# ------------ CONFIG ------------
ALLOWED_HOST = "datascience.uchicago.edu"
SEEDS = [
    "https://datascience.uchicago.edu/about/",
    "https://datascience.uchicago.edu/education/",
    "https://datascience.uchicago.edu/research/",
    "https://datascience.uchicago.edu/people/",
    "https://datascience.uchicago.edu/news-events/news/",
    "https://datascience.uchicago.edu/news-events/events/",
    "https://datascience.uchicago.edu/news-events/insights/",
]
MAX_PAGES   = 400
DELAY_SEC   = 1.0      # be polite
TIMEOUT_SEC = 20
UA = "Mozilla/5.0 (RAG-Bot-Scraper; Jupyter)"

# Target CSV for you
TARGET_DIR = Path.home() / "Documents" / "UChicago" / "Qrt 4" / "Gen AI" / "Midterm" / "Text"
TARGET_DIR.mkdir(parents=True, exist_ok=True)
TARGET_CSV = TARGET_DIR / "chunks.csv"

# Also keep local working folder (optional artifacts)
WORK_DIR = Path("data_dsi")
WORK_DIR.mkdir(parents=True, exist_ok=True)

# ------------ HELPERS ------------
def sha256(s: str) -> str:
    return hashlib.sha256(s.encode("utf-8")).hexdigest()

def normalize_url(base: str, href: str | None) -> str | None:
    if not href:
        return None
    # drop fragments
    href = href.split("#")[0].strip()
    if not href:
        return None
    absu = urllib.parse.urljoin(base, href)
    u = urllib.parse.urlparse(absu)
    if u.scheme not in ("http", "https"):
        return None
    if u.netloc != ALLOWED_HOST:
        return None
    # strip tracking
    clean_q = urllib.parse.urlencode(
        [(k, v) for k, v in urllib.parse.parse_qsl(u.query, keep_blank_values=False)
         if not k.lower().startswith(("utm_", "fbclid", "gclid"))]
    )
    return urllib.parse.urlunparse(u._replace(query=clean_q))

def looks_like_html(url: str) -> bool:
    path = urllib.parse.urlparse(url).path.lower()
    bad_ext = (".pdf",".xml",".jpg",".jpeg",".png",".gif",".webp",".svg",".mp4",".mov",".zip",".ics",".doc",".ppt",".pptx",".xls",".xlsx")
    return not any(path.endswith(ext) for ext in bad_ext)

def fetch_html(url: str) -> str:
    time.sleep(DELAY_SEC)
    r = requests.get(url, headers={"User-Agent": UA}, timeout=TIMEOUT_SEC)
    r.raise_for_status()
    ctype = (r.headers.get("content-type") or "").split(";")[0].strip().lower()
    if "text/html" not in ctype:
        raise ValueError(f"Non-HTML content-type for {url}: {ctype}")
    return r.text

def extract_main_text(url: str, html: str) -> str:
    # prefer trafilatura if present
    if trafilatura is not None:
        try:
            x = trafilatura.extract(html, include_links=False, include_comments=False, url=url, favor_precision=True)
            if x and len(x.strip()) > 200:
                return _clean_ws(x)
        except Exception:
            pass
    # fallback: heuristic soup
    soup = BeautifulSoup(html, "lxml")
    for t in soup.select("script,style,noscript,header,footer,nav,aside"):
        t.decompose()
    main = soup.select_one("main") or soup.select_one("article") or soup.body or soup
    text = "\n".join(
        el.get_text(" ", strip=True)
        for el in main.find_all(["p","li"])
        if el.get_text(strip=True)
    )
    return _clean_ws(text)

def _clean_ws(text: str) -> str:
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

def get_title_h1_section_type(url: str, html: str) -> tuple[str, str, str, str]:
    """Return (title_tag, h1_text, section, type) based on URL patterns."""
    soup = BeautifulSoup(html, "lxml")
    title_tag = (soup.title.string.strip() if soup.title and soup.title.string else "")
    h1 = ""
    h1_el = soup.find("h1")
    if h1_el:
        h1 = h1_el.get_text(" ", strip=True)

    path = urllib.parse.urlparse(url).path
    segs = [s for s in path.split("/") if s]
    section = segs[0] if segs else "root"

    # type detection
    t = "generic"
    if len(segs) >= 2 and segs[0] == "people":
        t = "person"
    elif len(segs) >= 3 and segs[0] == "news-events" and segs[1] == "news":
        t = "news"
    elif len(segs) >= 3 and segs[0] == "news-events" and segs[1] == "events":
        t = "event"
    elif len(segs) >= 3 and segs[0] == "news-events" and segs[1] == "insights":
        t = "insight"

    return title_tag, h1, section, t

def extract_date(soup: BeautifulSoup) -> str:
    """Try to get a visible or meta date; return ISO string or ''."""
    # time elements
    for t in soup.find_all("time"):
        # prefer datetime attr
        if t.get("datetime"):
            try:
                return datetime.fromisoformat(t["datetime"].replace("Z","+00:00")).isoformat()
            except Exception:
                pass
        txt = t.get_text(" ", strip=True)
        iso = _parse_date_fuzzy(txt)
        if iso:
            return iso

    # common meta patterns
    meta_sel = [
        ('meta[property="article:published_time"]', "content"),
        ('meta[name="date"]', "content"),
        ('meta[itemprop="datePublished"]', "content"),
        ('meta[name="pubdate"]', "content"),
        ('meta[name="publishdate"]', "content"),
    ]
    for sel, key in meta_sel:
        m = soup.select_one(sel)
        if m and m.get(key):
            iso = _parse_date_fuzzy(m[key])
            if iso:
                return iso
    return ""

def _parse_date_fuzzy(s: str) -> str | None:
    s = s.strip()
    # Very small fuzzy parser: try common formats
    for fmt in ("%B %d, %Y", "%b %d, %Y", "%Y-%m-%d", "%m/%d/%Y"):
        try:
            return datetime.strptime(s, fmt).isoformat()
        except Exception:
            pass
    # Quick numeric capture like "2025-10-12T..."
    m = re.search(r"\d{4}-\d{2}-\d{2}", s)
    if m:
        try:
            return datetime.strptime(m.group(0), "%Y-%m-%d").isoformat()
        except Exception:
            pass
    return None

def extract_person_fields(soup: BeautifulSoup) -> dict:
    # Best-effort: name is H1 (already captured). Try role/affiliation line and email.
    role = ""
    email = ""
    # Role: often near h1 in a subtitle/span/p tag
    h1 = soup.find("h1")
    if h1:
        sib_texts = []
        for sib in h1.find_all_next(["p","div","span"], limit=6):
            txt = sib.get_text(" ", strip=True)
            if txt and len(txt) < 200:
                sib_texts.append(txt)
        if sib_texts:
            role = sib_texts[0]
    # Email
    a = soup.find("a", href=re.compile(r"^mailto:", re.I))
    if a:
        email = a.get_text(" ", strip=True) or a.get("href","").replace("mailto:","")
    return {"role": role, "email": email}

def chunk_text(text: str, target: int = 1600, overlap: int = 200) -> list[str]:
    text = re.sub(r"\s+", " ", text).strip()
    n = len(text)
    i, chunks = 0, []
    while i < n:
        j = min(i + target, n)
        window = text[i:j]
        k = window.rfind(". ")
        if k != -1 and (j - (i + k)) < 300:
            j = i + k + 1
        chunks.append(text[i:j].strip())
        i = max(j - overlap, j)
    return [c for c in chunks if c]


In [3]:
def crawl_and_build(max_pages=MAX_PAGES) -> tuple[pd.DataFrame, pd.DataFrame]:
    seen = set()
    q = deque(SEEDS)
    raw_records = []
    chunk_rows = []
    fetched = 0

    while q and fetched < max_pages:
        url = q.popleft()
        if url in seen or not looks_like_html(url):
            continue
        seen.add(url)

        try:
            html = fetch_html(url)
        except Exception as e:
            print(f"skip {url}: {e}")
            continue

        soup = BeautifulSoup(html, "lxml")
        title_tag, h1, section, ptype = get_title_h1_section_type(url, html)
        text = extract_main_text(url, html)

        date_iso = ""
        if ptype in ("news","event","insight"):
            date_iso = extract_date(soup) or ""

        extra = {}
        if ptype == "person":
            extra = extract_person_fields(soup)

        # Save raw record (handy for QA/debug)
        raw_records.append({
            "url": url,
            "title": title_tag,
            "h1": h1,
            "section": section,
            "type": ptype,
            "date": date_iso,
            "text": text,
            "extra": extra,
            "word_count": len(text.split()) if text else 0,
            "fetched_at": datetime.utcnow().isoformat()+"Z"
        })

        # Enqueue internal links
        for a in soup.find_all("a", href=True):
            nxt = normalize_url(url, a["href"])
            if nxt and nxt not in seen and looks_like_html(nxt):
                q.append(nxt)

        # Build chunks
        if text and len(text.split()) >= 50:
            chunks = chunk_text(text, target=1600, overlap=200)
            page_hash = sha256(url)[:10]
            for idx, ch in enumerate(chunks):
                chunk_rows.append({
                    "id": f"{page_hash}-{idx:04d}",
                    "url": url,
                    "title": h1 or title_tag,
                    "section": section,
                    "type": ptype,
                    "date": date_iso,
                    "chunk_id": idx,
                    "text": ch,
                    "sha256": sha256(ch),
                })

        fetched += 1
        if fetched % 10 == 0:
            print(f"…fetched {fetched} pages, chunks so far: {len(chunk_rows)}")

    df_raw = pd.DataFrame(raw_records)
    df_chunks = pd.DataFrame(chunk_rows)
    print(f"Done. Pages fetched: {fetched}, raw pages: {len(df_raw)}, chunks: {len(df_chunks)}")
    return df_raw, df_chunks


In [5]:
df_raw, df_chunks = crawl_and_build()

# Save CSV for RAG
TARGET_DIR.mkdir(parents=True, exist_ok=True)
df_chunks.to_csv(TARGET_CSV, index=False)
print(f"✅ RAG CSV saved to: {TARGET_CSV}")

# (Optional) also save JSONL artifacts for debugging
with (WORK_DIR / "raw_pages.jsonl").open("w", encoding="utf-8") as f:
    for rec in df_raw.to_dict(orient="records"):
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

with (WORK_DIR / "chunks.jsonl").open("w", encoding="utf-8") as f:
    for rec in df_chunks.to_dict(orient="records"):
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

print("Also wrote:", (WORK_DIR / "raw_pages.jsonl").resolve())
print("Also wrote:", (WORK_DIR / "chunks.jsonl").resolve())

display(df_chunks.head(5))
print("Total chunks:", len(df_chunks))


  "fetched_at": datetime.utcnow().isoformat()+"Z"


…fetched 10 pages, chunks so far: 33
…fetched 20 pages, chunks so far: 65
…fetched 30 pages, chunks so far: 113
…fetched 40 pages, chunks so far: 153
…fetched 50 pages, chunks so far: 191
…fetched 60 pages, chunks so far: 209
…fetched 70 pages, chunks so far: 229
…fetched 80 pages, chunks so far: 251
…fetched 90 pages, chunks so far: 271
…fetched 100 pages, chunks so far: 292
skip https://datascience.uchicago.edu/news-events/past-events: 404 Client Error: Not Found for url: https://datascience.uchicago.edu/news-events/past-events
…fetched 110 pages, chunks so far: 317
skip https://datascience.uchicago.edu/outreach/industry-affiliates/: 404 Client Error: Not Found for url: https://datascience.uchicago.edu/outreach/industry-affiliates/
…fetched 120 pages, chunks so far: 345
…fetched 130 pages, chunks so far: 377
…fetched 140 pages, chunks so far: 424
…fetched 150 pages, chunks so far: 463
…fetched 160 pages, chunks so far: 506
…fetched 170 pages, chunks so far: 536
…fetched 180 pages, ch

Unnamed: 0,id,url,title,section,type,date,chunk_id,text,sha256
0,3e4c2d399c-0000,https://datascience.uchicago.edu/about/,About - DSI,about,generic,,0,The UChicago Data Science Institute executes t...,93a410375295f383d3323f1bfcfedee3f6c2c05f0b4bd5...
1,3e4c2d399c-0001,https://datascience.uchicago.edu/about/,About - DSI,about,generic,,1,The Data Science Institute is located on the U...,5af0feb9df7cd0add84cd7ea4f5a0f9826fb553d1031ec...
2,2578caec03-0000,https://datascience.uchicago.edu/education/,Education - DSI,education,generic,,0,"Building the foundations of data science, cons...",80f46da1c5f116bede88ce148e831a9aaf0ea2e2b331e7...
3,2578caec03-0001,https://datascience.uchicago.edu/education/,Education - DSI,education,generic,,1,Reimagining data science education and broaden...,2ff9136e12a47fcf063b2fbe2d9262ddefabc73d378375...
4,473e55df7b-0000,https://datascience.uchicago.edu,Data Science Institute,root,generic,,0,We have several job opportunities currently op...,d84983247a1bfbc1f4700d214d0e160a7b56d18ce856fc...


Total chunks: 1461
