In [14]:
%%capture
!pip install faiss-cpu langchain langchain-community langchain-core langchain-huggingface pacmap


⏱️ Execution time: 15.22s


# Applied ML Final Project Scratch

Hoping to work through some of the requirements in a scratch notebook to be incorporated into some final project, whatever it ends up looking like.

In [19]:
# PARAMETERS
# Static or at least relatively static

# SETUP
COLAB_ROOT    = "/content"                             # Top-level dir for colab files
REPO_PATH     = "cpaynerogers/COMS4995---RAG-Chatbot"  # Repo path
REPO_BRANCH   = "colin"                                # Development branch
SOURCE_PATH   = "Code/src"                             # Within the repo, the root of the source code dir
DOCUMENT_PATH = "appliedml/FINAL/documents"            # Relative to user's drive, path to documents
RAW_DATA_PATH = "appliedml/FINAL/raw"

# RAG
TOP_K           = 5
EMBEDDING_MODEL = "thenlper/gte-small"
RANDOM_STATE    = 42

In [3]:
# TOGGLES
# Change which part(s) of the notebook to run

DEBUG        = 0  # Mostly extra printing
SCRAPE_DOC   = 0  # Get subjects, instructors from Columbia directory of classes (DOC)
SAVE_COURSES = 0  # Save scraped DOC courses as docs
SCRAPE_CULPA = 0  # Get professor ratings from Culpa

## Repo Setup

Connect to Google Drive. Clone the project repository for shared code snippets.

In [4]:
import os
import sys

# Repo config
REPO_URL = f"https://github.com/{REPO_PATH}"
REPO_ROOT = os.path.join(COLAB_ROOT, REPO_URL.split("/")[-1].split(".")[0])

# Force clone the repository
!rm -rf {REPO_ROOT}
os.chdir(COLAB_ROOT)
!git clone {REPO_URL}

# Pulls the latest code from the provided branch and adds the
# source code to the Python system path
os.chdir(REPO_ROOT)
!git pull
!git remote -v
!git checkout {REPO_BRANCH}
sys.path.append(os.path.join(REPO_ROOT, SOURCE_PATH))
os.chdir(COLAB_ROOT)

Cloning into 'COMS4995---RAG-Chatbot'...
remote: Enumerating objects: 21, done.[K
remote: Counting objects: 100% (21/21), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 21 (delta 0), reused 17 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (21/21), 13.33 KiB | 13.33 MiB/s, done.
Already up to date.
origin	https://github.com/cpaynerogers/COMS4995---RAG-Chatbot (fetch)
origin	https://github.com/cpaynerogers/COMS4995---RAG-Chatbot (push)
Branch 'colin' set up to track remote branch 'colin' from 'origin'.
Switched to a new branch 'colin'


## Notebook Setup

Add helper functions

In [5]:
from helpers import add_cell_timer

add_cell_timer()

## Drive Setup

Mount the user's Google Drive. List the documents available for RAG

In [6]:
from pprint import pprint

from google.colab import drive

# NOTE: this is custom to Colin's drive setup
DRIVE_MOUNT = os.path.join(COLAB_ROOT, "drive")
drive.mount(DRIVE_MOUNT)

DOCUMENT_DIR = os.path.join(DRIVE_MOUNT, "MyDrive", DOCUMENT_PATH)
if DEBUG:
    print("\nDOCUMENTS FOR RAG:")
    pprint(os.listdir(DOCUMENT_DIR))

RAW_DATA_DIR = os.path.join(DRIVE_MOUNT, "MyDrive", RAW_DATA_PATH)
if DEBUG:
    print("\nRAW DATA:")
    pprint(os.listdir(RAW_DATA_DIR))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

⏱️ Execution time: 0.66s


## Scrape for Professors

If toggled, look at the directory of classes <https://doc.sis.columbia.edu/> for a supported semester, get the professors teaching, and the subjects available. It seems that this can't be done programmatically (is blocked by CloudFlare), so, this section assumes the data is available in the raw data folder. Get the data by:

* Navigating to <https://doc.search.columbia.edu/>
* Choosing the semester-of-interest
* Clicking `Search`
* Opening Chrome dev tools
* Right clicking the top-most `<html>` and copying `outerHTML`
* Saving it as a file


For each hour, e.g. visiting:

* <https://doc.search.columbia.edu/classes/+?semes=20261&subterm=all&hour=8>
* <https://doc.search.columbia.edu/classes/+?semes=20261&subterm=all&hour=9>
* ...
* <https://doc.search.columbia.edu/classes/+?semes=20261&subterm=all&hour=20>

In [7]:
import warnings

from utils import ColumbiaCourseData


if SCRAPE_DOC:
    c = ColumbiaCourseData(
        courses_files=[
            os.path.join(RAW_DATA_DIR, f"spring_2026_hour_{str(hour).zfill(2)}.html")
            for hour in range(8, 21)
        ],
        debug=DEBUG,
    )
    subjects = c.subjects
    instructors = c.instructors
else:
    warnings.warn(
        "Need to save and then load subjects, instructors, especially "
        "if this takes a long time (it doesn't)"
    )



In [8]:
import os
import re
from dataclasses import replace

from helpers import print_wrapped


def slug(title: str) -> str:
    return re.sub(r'[^A-Za-z0-9]+', "_", title).strip("_")


if SAVE_COURSES:
    # First, get a unique set
    seen = set()
    courses = [
        course
        for course in c.courses
        if not (
            (course.title, course.description) in seen or
            seen.add((course.title, course.description))
        )
    ]

    # Define the file names
    courses = [
        replace(course, file_name=f"{slug(course.title.lower())}.txt")
        for course in courses
    ]

    # Format and save each file
    for course in courses:
        file_content = (
            f"TITLE: {course.title}\n"
            f"SUBJECT: {course.subject}\n"
            f"INSTRUCTOR: {course.instructor}\n"
            f"DESCRIPTION: {course.description}"
        )
        file_path = os.path.join(DOCUMENT_DIR, "courses", course.file_name)
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(file_content)

In [9]:
from helpers import print_wrapped

if DEBUG and SCRAPE_DOC:
    print("SUBJECTS: ")
    pprint(c.subjects)

    print("INSTRUCTORS: ")
    print_wrapped(c.instructors)

## CULPA Professor Search

Given the list of professors, search for their ratings from CULPA.

In [10]:
import csv

from utils import get_professor_ids, get_professor_rating


if SCRAPE_CULPA:
    # Search 'instructors[2::]'
    # 'instructors[0]' is '', 'instructors[1]' is '. Faculty'
    # May need to refind DOC search
    professor_ratings = []
    professor_count = 0
    for name in instructors[2::]:

        # Get the professor ID(s)
        ids = get_professor_ids(name)
        if ids is None:
            continue

        # Get their rating
        # Some professors have a few pages, all populated with data
        # TODO: are the multiple pages for the same professor, or
        # different professors with the same name?
        count = 0
        rating = 0
        total_rating = 0
        for id in ids:
            rating = get_professor_rating(id)
            if rating is not None:
                count += 1
                total_rating += rating
        if count:
            rating = total_rating / count

        # To save to CSV
        if rating:
            professor_count += 1
            professor_ratings.append((name, rating))


    with open("culpa_ratings.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["name", "rating"])
        writer.writerows(professor_ratings)

## Minimal RAG?

Trying to follow <https://huggingface.co/learn/cookbook/en/advanced_rag> and get a RAG working with the docs we have

### Create a knowledge base

In [11]:
import os
from pathlib import Path

from langchain_core.documents import Document as LangchainDocument
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
from transformers import AutoTokenizer

DEBUG = True

# Embedding and chunking setup
embedding_model = SentenceTransformer(EMBEDDING_MODEL)
max_sequence_length = embedding_model.max_seq_length
chunk_size = int(max_sequence_length * 0.9)
chunk_overlap = int(chunk_size * 0.1)
if DEBUG:
    print(f"Max sequence length: {max_sequence_length}")
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL)
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    add_start_index=True,
    strip_whitespace=True,
)

# Gather documents.
# Can speed up by only tokenizing once, at risk of splitting
# more documents due to overlap
base_path = Path(DOCUMENT_DIR)
knowledge_base = []
for dirpath, _, filenames in os.walk(DOCUMENT_DIR):
    for file_name in filenames:
        doc_path = os.path.join(dirpath, file_name)
        with open(doc_path, "r", encoding="utf-8") as f:
            text = f.read()
        relative_doc_path = Path(doc_path).relative_to(base_path)

        metadata = {"source": str(relative_doc_path)}
        doc_tokens = tokenizer.encode(text)
        doc = LangchainDocument(
            page_content=text,
            metadata=metadata,
        )
        if len(doc_tokens) < max_sequence_length:
            knowledge_base.append(doc)
        else:
            doc_chunks = text_splitter.split_documents([doc])
            for chunk in doc_chunks:
                # Preserve metadata
                chunk.metadata = {
                    **chunk.metadata,
                    **metadata,
                }
            knowledge_base.extend(doc_chunks)
if DEBUG:
    print(f"Base knowledge base: {len(knowledge_base)}")

# Check that documents won't be truncated
for doc in tqdm(knowledge_base):
    tokenized_doc = tokenizer.encode(doc.page_content)
    if len(tokenized_doc) > max_sequence_length:
        raise ValueError("Document too large")

Max sequence length: 512
Base knowledge base: 880


  0%|          | 0/880 [00:00<?, ?it/s]


⏱️ Execution time: 25.93s


### Create a vector database

Using hugging face embeddings and `faiss`

In [12]:
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    multi_process=True,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    knowledge_base,
    embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)


⏱️ Execution time: 16.38s


#### Plot

The vector embeddings

In [21]:
import pacmap
import numpy as np
import pandas as pd
import plotly.express as px

# List of user queries
user_queries = [
    "Which computer science course should I take after Advance Programming?",
    "How can I prepare for a machine learning course?",
    "Are there any spring courses on Derrida?"
]

# Embed all user queries
query_vectors = [embedding_model.embed_query(q) for q in user_queries]

embedding_projector = pacmap.PaCMAP(
    n_components=2,
    n_neighbors=None,
    MN_ratio=0.5,
    FP_ratio=2.0,
    random_state=RANDOM_STATE,
)

# Document embeddings + query embeddings
doc_embeddings = [
    list(KNOWLEDGE_VECTOR_DATABASE.index.reconstruct_n(idx, 1)[0])
    for idx in range(len(knowledge_base))
]
embeddings_2d = doc_embeddings + query_vectors

documents_projected = embedding_projector.fit_transform(
    np.array(embeddings_2d), init="pca"
)

n_docs = len(knowledge_base)
n_queries = len(user_queries)

# Data for document points
rows = [
    {
        "x": documents_projected[i, 0],
        "y": documents_projected[i, 1],
        "source": knowledge_base[i].metadata["source"],
        "extract": knowledge_base[i].page_content[:100] + "...",
        "symbol": "circle",
        "size_col": 4,
    }
    for i in range(n_docs)
]

# Data for query points
rows += [
    {
        "x": documents_projected[n_docs + j, 0],
        "y": documents_projected[n_docs + j, 1],
        "source": "User query",  # all grouped under one legend entry
        "extract": user_queries[j],
        "symbol": "star",
        "size_col": 100,
    }
    for j in range(n_queries)
]

df = pd.DataFrame.from_dict(rows)

fig = px.scatter(
    df,
    x="x",
    y="y",
    color="source",
    hover_data=["extract"],
    size="size_col",
    symbol="symbol",
    color_discrete_map={"User query": "black"},
    width=1000,
    height=700,
)
fig.update_traces(
    marker=dict(opacity=1, line=dict(width=0, color="DarkSlateGrey")),
    selector=dict(mode="markers"),
)
fig.update_layout(
    legend_title_text="<b>Chunk source</b>",
    title="<b>2D Projection of Chunk Embeddings via PaCMAP</b>",
)
fig.show()




⏱️ Execution time: 39.33s
