### Midterm Project
### Chatbot for University of Chicago Applied Data Science Program Questions

In [17]:
!pip install -U langchain


Collecting langchain
  Downloading langchain-1.0.4-py3-none-any.whl.metadata (4.9 kB)
Collecting langgraph<1.1.0,>=1.0.2 (from langchain)
  Downloading langgraph-1.0.2-py3-none-any.whl.metadata (7.4 kB)
Collecting langgraph-prebuilt<1.1.0,>=1.0.2 (from langgraph<1.1.0,>=1.0.2->langchain)
  Downloading langgraph_prebuilt-1.0.2-py3-none-any.whl.metadata (5.0 kB)
Downloading langchain-1.0.4-py3-none-any.whl (93 kB)
Downloading langgraph-1.0.2-py3-none-any.whl (156 kB)
Downloading langgraph_prebuilt-1.0.2-py3-none-any.whl (34 kB)
Installing collected packages: langgraph-prebuilt, langgraph, langchain
[2K  Attempting uninstall: langgraph-prebuilt
[2K    Found existing installation: langgraph-prebuilt 1.0.1
[2K    Uninstalling langgraph-prebuilt-1.0.1:
[2K      Successfully uninstalled langgraph-prebuilt-1.0.1
[2K  Attempting uninstall: langgraph
[2K    Found existing installation: langgraph 1.0.1
[2K    Uninstalling langgraph-1.0.1:
[2K      Successfully uninstalled langgraph-1.0.1


In [None]:
# imports & configs
import os, re, time, glob, uuid, urllib.parse, json, textwrap, requests
from bs4 import BeautifulSoup

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# prompting for openAI API key
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY", "enter your key here")  # set in your env before running

# creating folders for raw links, text from links, and the FAISS indexes
DATA_DIR = "msads_data"
HTML_DIR = f"{DATA_DIR}/html"
TEXT_DIR = f"{DATA_DIR}/text"
INDEX_DIR = f"{DATA_DIR}/index"
os.makedirs(HTML_DIR, exist_ok=True)
os.makedirs(TEXT_DIR, exist_ok=True)
os.makedirs(INDEX_DIR, exist_ok=True)

# defaults for the model
EMBED_MODEL = "text-embedding-3-small"   
CHAT_MODEL  = "gpt-4o-mini"              
K_RETRIEVE  = 5
CHUNK_SIZE  = 1000
CHUNK_OVERLAP = 200


In [25]:
# download MS-ADS pages

SEED = "https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/"

TARGET_URLS = [
    SEED,
    "https://datascience.uchicago.edu/education/masters-programs/in-person-program/",
    "https://datascience.uchicago.edu/education/masters-programs/online-program/",
    "https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/capstone-projects/",
    "https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/course-progressions/",
    "https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/how-to-apply/",
    "https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/events-deadlines/",
    # "https://datascience.uchicago.edu/education/ms-in-applied-data-science/tuition-fees-aid/",
    "https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/our-students/",
    "https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/instructors-staff/",
    "https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/faqs/",
]

def download_all(urls=TARGET_URLS, out_dir=HTML_DIR, throttle=0.3):
    """
    Downloads each page in TARGET_URLS and saves raw HTML into HTML_DIR.
    """
    headers = {"User-Agent": "UChicago-MSADS-RAG (class project)"}
    os.makedirs(out_dir, exist_ok=True)

    for url in TARGET_URLS:
        try:
            resp = requests.get(url, headers=headers, timeout=20)
            resp.raise_for_status()
            fname = urllib.parse.quote(url, safe="") + ".html"
            with open(os.path.join(out_dir, fname), "w", encoding="utf-8") as f:
                f.write(resp.text)
            print(f"Saved: {url}")
        except Exception as e:
            print(f"⚠️  Skipped {url}: {e}")

    print(f"{len(urls)} pages saved in {out_dir}")

# Run the fixed-list downloader
download_all()


Saved: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/
Saved: https://datascience.uchicago.edu/education/masters-programs/in-person-program/
Saved: https://datascience.uchicago.edu/education/masters-programs/online-program/
Saved: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/capstone-projects/
Saved: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/course-progressions/
Saved: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/how-to-apply/
Saved: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/events-deadlines/
Saved: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/our-students/
Saved: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/instructors-staff/
Saved: https://datascience.uchicago.edu/education/masters-programs/

In [27]:
# clean & normalize the text
# remove parts of the HMTL page before extracting text
DROP_SELECTORS = ["nav", "footer", ".site-footer", ".menu", ".breadcrumbs", "script", "style"]

# this helper function finds things like navigation bars, footers, menus, etc.
# and removes them from the HTML tree
def clean_to_text(html: str) -> str:
    soup = BeautifulSoup(html, "html.parser")
    for sel in DROP_SELECTORS:
        for tag in soup.select(sel): 
            tag.decompose()
    text = soup.get_text("\n")
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{2,}", "\n\n", text).strip()
    return text

# looks in HTML_DIR folder at each HTML page downloaded, loads the file,
# passes it to clean_to_text, writes a .txt file into TEXT_DIR folder
def run_clean(in_dir=HTML_DIR, out_dir=TEXT_DIR):
    for fn in glob.glob(f"{in_dir}/*.html"):
        with open(fn, "r", encoding="utf-8") as f:
            html = f.read()
        txt = clean_to_text(html)
        base = os.path.basename(fn).replace(".html", "")
        with open(f"{out_dir}/{base}.txt", "w", encoding="utf-8") as f:
            f.write(txt)
    print("Cleaned:", len(glob.glob(f'{out_dir}/*.txt')), "files")

run_clean()

Cleaned: 10 files


In [29]:
# build Chroma
from pathlib import Path
from langchain_community.vectorstores import Chroma

def load_docs(txt_dir=TEXT_DIR):
    docs = []
    for path in glob.glob(f"{txt_dir}/*.txt"):
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
        # recover URL from filename
        url = urllib.parse.unquote(os.path.basename(path).replace(".txt","").replace("%2F","/"))
        # modest section labeling improves retrieval score
        docs.append({"text": text, "source": url})
    return docs

def chunk_docs(docs):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, separators=["\n\n", "\n", ". ", " "]
    )
    texts, metas = [], []
    for d in docs:
        parts = splitter.split_text(d["text"])
        for i, p in enumerate(parts):
            texts.append(p)
            metas.append({"source": d["source"], "chunk_id": i})
    return texts, metas


PERSIST_DIR = f"{INDEX_DIR}/chroma"
Path(PERSIST_DIR).mkdir(parents=True, exist_ok=True)

def build_chroma():
    docs = load_docs()
    texts, metas = chunk_docs(docs)
    if not texts:
        raise RuntimeError("No chunks created — rerun cleaning or lower filters.")
    embeddings = OpenAIEmbeddings(model=EMBED_MODEL)
    vs = Chroma.from_texts(
        texts=texts,
        embedding=embeddings,
        metadatas=metas,
        persist_directory=PERSIST_DIR,
        collection_name="msads_msads"
    )
    vs.persist()
    print(f"Built Chroma index with {len(texts)} chunks at {PERSIST_DIR}")

build_chroma()

Built Chroma index with 307 chunks at msads_data/index/chroma


  vs.persist()


In [30]:
# Personally Identifiable Information scrubbers (saftey add-on in the instructions)
# we are removing email addresses (this gibberish looking pattern is coding for emails like mo.abdelhamid@uchicago.edu
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
# and removing phone numbers (this pattern matches phone numbers in many formats)
PHONE_RE = re.compile(r"\+?\d[\d\-\(\) ]{7,}\d")

def redact_pii(s: str) -> str:
    s = EMAIL_RE.sub("[REDACTED_EMAIL]", s)
    s = PHONE_RE.sub("[REDACTED_PHONE]", s)
    return s

In [31]:
# Grounding prompt we have used before
SYSTEM_PROMPT = """You are a helpful assistant for the University of Chicago MS in Applied Data Science.
Answer ONLY using the provided context. If the answer is not in the context, say
'I don’t know based on the official MS-ADS pages I have.' Keep answers concise,
quote or paraphrase key lines, and include source URLs."""

HUMAN_PROMPT = """Question:
{question}

Context:
{context}

Rules:
- Do not invent program details or deadlines.
- Prefer bullet points for lists.
- End with 'Sources:' and list the source URLs you used.
"""

rag_prompt = ChatPromptTemplate.from_messages(
    [("system", SYSTEM_PROMPT), ("human", HUMAN_PROMPT)]
)

In [35]:
# retriever + multi-query expansion
from langchain_core.documents import Document

# loading embeddings and vector store
# chose OpenAIEmbeddings model and Chroma vector database
emb = OpenAIEmbeddings(model=EMBED_MODEL)
vs = Chroma(
    persist_directory=PERSIST_DIR,
    embedding_function=emb,
    collection_name="msads_msads"
)
retriever = vs.as_retriever(search_kwargs={"k": K_RETRIEVE})

# this is a multi-query element. if someone says "tell me about the capstone"
# the LLM will decompose it into three related queries before responding
# temperature of 0 makes it deterministic, giving same responses every time
llm_qt = ChatOpenAI(model=CHAT_MODEL, temperature=0)

# this prompts GPT-4 to use the original question and turn it into 3 related queries
QT_PROMPT = ChatPromptTemplate.from_messages([
    ("system", "You rewrite questions into diverse, relevant search queries for semantic retrieval."),
    ("human", "Rewrite the user question into 3 diverse, concise search queries focused on MS-ADS site.\n\nQuestion: {q}")
])

# user asks a question, q, it's turned into plain text
# 'lines' removes bullets or blank lines from the model's ouput
def multi_query(q: str):
    out = llm_qt.invoke(QT_PROMPT.format_messages(q=q))
    lines = [l.strip("-• ").strip() for l in out.content.split("\n") if l.strip()]
    # if the model doesn't generate two distinct queries, it just rewrites the original query three times
    if len(lines) < 2:
        lines = [q, f"What is {q} (MS-ADS)?", f"Details about {q} on MS-ADS"]
        # multi_query() takes one user question and gives back three phrased variations of it
    return lines[:3]

# this retrieves documents for all query variants

def retrieve_with_qt(q: str, k=K_RETRIEVE):
    queries = multi_query(q)
    # blank array for relevant queries
    hits = []
    # mq represents each rewritten query
    for mq in queries:
        # retriever fetches relevant chunks 'docs' from FAISS index
        docs = retriever.invoke(mq) or []
        # relevant chunks added to hits
        hits.extend(docs)
    # duplicates removed
    seen = set(); uniq = []
    for h in hits:
        key = (h.metadata.get("source",""), h.metadata.get("chunk_id",-1))
        if key not in seen:
            seen.add(key); uniq.append(h)
        if len(uniq) >= k:
            break
    return uniq


  vs = Chroma(


In [37]:
# This function takes a list of retrieved document objects and converts them into
# one long formatted text string that the language model can read easily.
# Each document is labeled with its source URL and separated by "---".
# The redact_pii function removes any personal information such as emails or phone numbers.
def format_docs(docs):
    blocks = []
    for d in docs:
        src = d.metadata.get("source","")
        blocks.append(f"[Source: {src}]\n{redact_pii(d.page_content)}")
    return "\n\n---\n\n".join(blocks)

# This line initializes the GPT-4 model that will generate the final answer text
llm = ChatOpenAI(model=CHAT_MODEL, temperature=0)

# The build_chain function constructs the RAG pipeline.
# It defines how a user question is processed from start to finish: 
# retrieve documents, prepare context, prompt the model, and parse the result
def build_chain():
    # The inner function _retrieve handles the retrieval step for one user input.
    # It cleans the question of any personal data, retrieves the top matching documents,
    # and returns a dictionary that includes the question, the formatted context text, and the documents themselves
    def _retrieve(inputs):
        q = redact_pii(inputs["question"]) # no PII
        docs = retrieve_with_qt(q, k=K_RETRIEVE) # multi query retrieval for relevant docs
        return {"question": q, "context": format_docs(docs), "docs": docs} #package docs into dictionary

    # RunnablePassthrough passes the input question through, _retrieve fetches and formats context for that question,
    # rag_prompt builds the final prompt that includes both question and context, llm generates the answer from the prompt,
    # StrOutputParser extracts the text content of the model’s response
    chain = (
        RunnablePassthrough.assign(**{"question": lambda x: x["question"]})
        | _retrieve
        | rag_prompt
        | llm
        | StrOutputParser()
    )
    return chain

rag_chain = build_chain()

In [39]:
# This function answers a user's question using the pipleline checks for missing sources or 
# empty results to prevent hallucinated answers
def answer(question: str):
    # Retrieve relevant documents based on the question
    docs = retrieve_with_qt(question, k=K_RETRIEVE)
    # Format the retrieved documents into readable context for the model
    ctx = format_docs(docs)
    # Build the final prompt by inserting the question and context into the template
    msg = rag_prompt.format(question=redact_pii(question), context=ctx)
    # Send the prompt to the language model and get the generated text
    out = llm.invoke(msg).content

    #  # If the model forgets to include a "Sources:" section, add one manually
    urls = list({d.metadata.get("source","") for d in docs if d.metadata.get("source")})
    urls = [u if u.startswith("http") else urllib.parse.unquote(u) for u in urls]
    if "Sources:" not in out:
        out += "\n\nSources:\n" + "\n".join(urls)
    elif out.strip().endswith("Sources:"):
        out += "\n" + "\n".join(urls)

    # hallucination check: if no documents were retrieved, return a safe answer
    if not docs:
        out = "I don’t know based on the official MS-ADS pages I have.\n\nSources:\n( none )"
    # Return the model's answer text and the retrieved document list
    return out, docs


In [41]:
# test cell for RAG pipeline

# Define a question to test
test_question = "What are the core courses in the MS in Applied Data Science program?"

# Run it through your answer() helper
try:
    answer_text, retrieved_docs = answer(test_question)
except Exception as e:
    raise RuntimeError(f"RAG pipeline error: {e}")

# Show the answer
print("QUESTION:\n", test_question, "\n")
print("ANSWER:\n", answer_text, "\n")

# 4) Summarize retrieved sources
unique_sources = []
seen = set()
for d in retrieved_docs:
    src = d.metadata.get("source", "")
    if src and src not in seen:
        seen.add(src)
        unique_sources.append(src)

print("SOURCES USED:")
for s in unique_sources:
    print(" -", s)

QUESTION:
 What are the core courses in the MS in Applied Data Science program? 

ANSWER:
 The core courses in the MS in Applied Data Science program include:

- A total of **6 core courses** that build theoretical knowledge and practical application in data science.
- These courses are designed to help students examine real-world business problems.

Additionally, there is a **Career Seminar** (noncredit, required) that supports the development of professional skills.

Sources:
- https://datascience.uchicago.edu/education/masters-programs/online-program/
- https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/course-progressions/ 

SOURCES USED:
 - https://datascience.uchicago.edu/education/masters-programs/online-program/
 - https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/course-progressions/
 - https://datascience.uchicago.edu/education/masters-programs/in-person-program/


In [45]:
# test cell for RAG pipeline

# Define a question to test
test_question = "who are the faculties?"

# Run it through your answer() helper
try:
    answer_text, retrieved_docs = answer(test_question)
except Exception as e:
    raise RuntimeError(f"RAG pipeline error: {e}")

# Show the answer
print("QUESTION:\n", test_question, "\n")
print("ANSWER:\n", answer_text, "\n")

# 4) Summarize retrieved sources
unique_sources = []
seen = set()
for d in retrieved_docs:
    src = d.metadata.get("source", "")
    if src and src not in seen:
        seen.add(src)
        unique_sources.append(src)

print("SOURCES USED:")
for s in unique_sources:
    print(" -", s)

QUESTION:
 who are the faculties? 

ANSWER:
 The faculty and staff for the MS in Applied Data Science program at the University of Chicago include:

- **Greg Green**: Senior Instructional Professor; Senior Director of the DSI Polsky Transform Initiative; Senior Director of DSI Executive and Professional Education
- **Arnab Bose, PhD**: Faculty member (specific role not detailed)
- **Kristin McCann, PhD**: Chief of Staff, Executive Director, MS in Applied Data Science
- **Alison Ossyra**: Director, External Partnerships and Career Services, MS in Applied Data Science
- **DuJuan Smith, PhD**: Director, Student Affairs, MS in Applied Data Science
- **Brody Tate, EdD**: Program Manager, Online, MS in Applied Data Science
- **Daniel Truesdale, MPP**: Director, Enrollment Management and Analytics, MS in Applied Data Science
- **Taylor Wallace, MEd**: Graduate Academic Advisor, MS in Applied Data Science
- **Samantha Widemon, MNA**: Graduate Academic Advisor, MS in Applied Data Science
- **Je

In [73]:
# test cell for RAG pipeline

# Define a question to test
test_question = "Can you provide information about the capstone project?"

# Run it through your answer() helper
try:
    answer_text, retrieved_docs = answer(test_question)
except Exception as e:
    raise RuntimeError(f"RAG pipeline error: {e}")

# Show the answer
print("QUESTION:\n", test_question, "\n")
print("ANSWER:\n", answer_text, "\n")

# 4) Summarize retrieved sources
unique_sources = []
seen = set()
for d in retrieved_docs:
    src = d.metadata.get("source", "")
    if src and src not in seen:
        seen.add(src)
        unique_sources.append(src)

print("SOURCES USED:")
for s in unique_sources:
    print(" -", s)

QUESTION:
 Can you provide information about the capstone project? 

ANSWER:
 - The Capstone Project is the culminating experience of the MS in Applied Data Science program.
- Students will work on real business problems, gaining valuable insights using authentic data.
- Collaboration with project sponsors is key to developing data science solutions that address organizational challenges.
- There is an option to join a research-focused team, leveraging the university's research portfolio.
- The Capstone experience enhances collaboration skills, provides mentoring, and offers networking opportunities across various sectors, including finance and entertainment.

Sources:
- https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/capstone-projects/
- https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/ 

SOURCES USED:
 - https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/capstone-projects/

In [90]:
# test cell for RAG pipeline

# Define a question to test
test_question = "Are there opportunities to work with real-world datasets or industry partners?"

# Run it through your answer() helper
try:
    answer_text, retrieved_docs = answer(test_question)
except Exception as e:
    raise RuntimeError(f"RAG pipeline error: {e}")

# Show the answer
print("QUESTION:\n", test_question, "\n")
print("ANSWER:\n", answer_text, "\n")

# 4) Summarize retrieved sources
unique_sources = []
seen = set()
for d in retrieved_docs:
    src = d.metadata.get("source", "")
    if src and src not in seen:
        seen.add(src)
        unique_sources.append(src)

print("SOURCES USED:")
for s in unique_sources:
    print(" -", s)

QUESTION:
 Are there opportunities to work with real-world datasets or industry partners? 

ANSWER:
 Yes, there are opportunities to work with real-world datasets and industry partners through the Capstone Experience in the MS in Applied Data Science program. This experience allows students to help top companies across multiple sectors solve real business problems. 

Sources:
- https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/
- https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/events-deadlines/ 

SOURCES USED:
 - https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/events-deadlines/
 - https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/
 - https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/faqs/
