# Import required packages

In [1]:
#load packages
import pandas as pd
from functools import partial
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

In [3]:
import json
import tiktoken
import nltk

In [5]:
import chromadb
from sentence_transformers import SentenceTransformer

In [6]:
from transformers import AutoProcessor, AutoTokenizer, AutoModelForImageTextToText
import torch

In [7]:
from huggingface_hub import login
from dotenv import load_dotenv

# Load in Documents

In this section we load the scraped documents and clean them.

In [None]:
# Load the scraped majors table and, in the same expression, keep only the
# rows from position 8 up to (not including) the last one; the rows outside
# that window are non-major data.
majors = pd.read_csv("uva_majors.csv").iloc[8:-1]


In [None]:
# Boilerplate that the scraper captured around every catalog page.
# `test` and `text_to_remove_head` are two variants of the site header; they
# appear to differ only in the date stamp ("Mar 31, 2025" vs "Mar 12, 2025").
test = 'Info For Students Alumni Military Affiliated Students Faculty & Staff Search Search Submit Search Close search Info For Students Alumni Military Affiliated Students Faculty & Staff Calendars Academic Calendars Exam Schedules Student Records Diplomas Transcripts FERPA Programs Calendar Schools University Registrar Carruthers Hall, 1001 N. Emmet St. P.O. Box 400203 Charlottesville, VA 22904-4203 Staff Directory Contact Info Phone: (434) 924-4122 Fax: (434) 924-4156 Email: [email protected] Hours Of Operation M-F: 10am - noon and 1pm - 4pm Â© 2024 By the Rector and Visitors of the University of Virginia Legal Privacy Report a Barrier Share Your Feedback University of Virginia Mar 31, 2025 Undergraduate Record 2024-2025 Select a Catalog Undergraduate Record 2024-2025 Graduate Record 2024-2025 Global Search Catalog Search Choose Search Location Select an option Courses Programs Schools/Colleges & Departments Policies and Other Non-Academic Content Entire Catalog Search Keyword Field Whole Word/Phrase Advanced Search Catalog Navigation Catalog Home Academic Calendar Admission Schools Programs, Degrees & Course Info Student Resources ROTC University Regulations About UVA Archived Records HELP Undergraduate Record 2024-2025'
text_to_remove_head = 'Info For Students Alumni Military Affiliated Students Faculty & Staff Search Search Submit Search Close search Info For Students Alumni Military Affiliated Students Faculty & Staff Calendars Academic Calendars Exam Schedules Student Records Diplomas Transcripts FERPA Programs Calendar Schools University Registrar Carruthers Hall, 1001 N. Emmet St. P.O. Box 400203 Charlottesville, VA 22904-4203 Staff Directory Contact Info Phone: (434) 924-4122 Fax: (434) 924-4156 Email: [email protected] Hours Of Operation M-F: 10am - noon and 1pm - 4pm Â© 2024 By the Rector and Visitors of the University of Virginia Legal Privacy Report a Barrier Share Your Feedback University of Virginia Mar 12, 2025 Undergraduate Record 2024-2025 Select a Catalog Undergraduate Record 2024-2025 Graduate Record 2024-2025 Global Search Catalog Search Choose Search Location Select an option Courses Programs Schools/Colleges & Departments Policies and Other Non-Academic Content Entire Catalog Search Keyword Field Whole Word/Phrase Advanced Search Catalog Navigation Catalog Home Academic Calendar Admission Schools Programs, Degrees & Course Info Student Resources ROTC University Regulations About UVA Archived Records HELP Undergraduate Record 2024-2025'
# Footer that closes every scraped page.
text_to_remove_tail = 'Back to Top | Print-Friendly Page (opens a new window) All catalogs © 2025 University of Virginia. Powered by Modern Campus Catalog™ . .'


def remove_text(text):
    """Return *text* without the scraped site header/footer boilerplate.

    The header variants are tried in a fixed order (`text_to_remove_head`
    first, then `test`); each one is cut off only when the text starts with
    it exactly. The footer is cut off only when the text ends with it
    exactly. Leading and trailing whitespace is stripped from the result.
    """
    for header in (text_to_remove_head, test):
        if text.startswith(header):
            text = text[len(header):]

    if text.endswith(text_to_remove_tail):
        body_end = len(text) - len(text_to_remove_tail)
        text = text[:body_end]

    return text.strip()

# Clean every page: run remove_text over each entry of the 'Text' column and
# write the cleaned strings back into the same column.
majors['Text'] = majors['Text'].apply(remove_text)

# Some majors carry extra text that differs slightly from the boilerplate
# handled above and still needs removing; pull up the text of the
# 'Psychology, B.A.' row to inspect it.
# NOTE(review): this is a bare expression; presumably only the last expression
# of a notebook cell is rendered, so this line may show nothing - confirm.
majors[majors['Major'] == 'Psychology, B.A.']['Text']

# Show rows at positions 50-89 for a visual check of the cleaned data.
majors[50:90]

In [None]:
# Prefixes found in front of some names in the 'Major' column (the ones that
# include "interdisciplinary"); they are dropped so only the major name remains.
text_to_remove = 'Interdisciplinary - '
text_to_remove2 = 'Interdisciplinary Major - '


def remove_text_majors(text):
    """Return the major name without a leading "Interdisciplinary" prefix.

    The two prefixes are tried in order (`text_to_remove`, then
    `text_to_remove2`); each is cut off only when the current text starts
    with it. Surrounding whitespace is stripped from the result.
    """
    for prefix in (text_to_remove, text_to_remove2):
        if text.startswith(prefix):
            text = text[len(prefix):]
    return text.strip()

# Strip the "Interdisciplinary" prefixes from every entry of the 'Major'
# column and store the cleaned names back in place.
majors['Major'] = majors['Major'].apply(remove_text_majors)

In [None]:
# Show rows at positions 20-39 for a visual check.
majors[20:40]

In [None]:
# Read in the schools data (one CSV of per-school academic rules); later cells
# use its 'School' and 'Text' columns.
schools = pd.read_csv("uva_schools_academic_rules.csv")
# Display the raw table.
schools

In [None]:
# Site header that the scraper captured in front of the school pages' text.
extra_begining_text = 'Info For Students Alumni Military Affiliated Students Faculty & Staff Search Search Submit Search Close search Info For Students Alumni Military Affiliated Students Faculty & Staff Calendars Academic Calendars Exam Schedules Student Records Diplomas Transcripts FERPA Programs Calendar Schools University Registrar Carruthers Hall, 1001 N. Emmet St. P.O. Box 400203 Charlottesville, VA 22904-4203 Staff Directory Contact Info Phone: (434) 924-4122 Fax: (434) 924-4156 Email: [email protected] Hours Of Operation M-F: 10am - noon and 1pm - 4pm Â© 2024 By the Rector and Visitors of the University of Virginia Legal Privacy Report a Barrier Share Your Feedback University of Virginia Mar 31, 2025 Undergraduate Record 2024-2025 Select a Catalog Undergraduate Record 2024-2025 Graduate Record 2024-2025 Global Search Catalog Search Choose Search Location Select an option Courses Programs Schools/Colleges & Departments Policies and Other Non-Academic Content Entire Catalog Search Keyword Field Whole Word/Phrase Advanced Search Catalog Navigation Catalog Home Academic Calendar Admission Schools Programs, Degrees & Course Info Student Resources ROTC University Regulations About UVA Archived Records HELP Undergraduate Record 2024-2025'


def remove_text_requirements(text):
    """Return *text* without the leading site header, whitespace-stripped.

    The header is removed only when *text* starts with it exactly; otherwise
    the text is just stripped of surrounding whitespace.
    """
    header_length = len(extra_begining_text) if text.startswith(extra_begining_text) else 0
    return text[header_length:].strip()

# Strip the site header from every entry of the schools 'Text' column and
# store the cleaned strings back in place.
schools['Text'] = schools['Text'].apply(remove_text_requirements)

In [None]:
# Display the schools table after cleaning.
schools

# Add school metadata to majors and convert to dictionary

In [None]:
# Independent copy of the majors data frame with a fresh 0..n-1 index, so the
# positional row ranges below line up and `majors` itself is left untouched.
# (A plain `majors_meta = majors` would only create a second name for the
# same frame.)
majors_meta = majors.copy().reset_index(drop=True)
majors_meta['School'] = None

# Row ranges (start inclusive, stop exclusive) and the school each range
# belongs to.
# NOTE(review): the ranges are hard-coded against the current row order of
# uva_majors.csv - re-check them whenever the scrape changes.
school_row_ranges = [
    (0, 3, 'School of Architecture'),
    (3, 62, 'College of Arts and Sciences'),
    (62, 63, 'McIntire School of Commerce'),
    (63, 65, 'School of Continuing and Professional Studies'),
    (65, 66, 'School of Data Science'),
    (66, 72, 'School of Education and Human Development'),
    (72, 83, 'School of Engineering and Applied Science'),
    (83, 84, 'Frank Batten School of Leadership and Public Policy'),
    (84, 85, 'School of Nursing'),
]

# Add the school to each major. A single .iloc assignment per range writes
# straight into the frame; chained indexing (df['School'][a:b] = ...) may
# write to a temporary copy and is not supported under pandas Copy-on-Write.
school_column = majors_meta.columns.get_loc('School')
for range_start, range_stop, range_school in school_row_ranges:
    majors_meta.iloc[range_start:range_stop, school_column] = range_school

In [None]:
# Show the first 10 rows to check the new 'School' column.
majors_meta.head(10)

In [None]:
# Convert majors to a list of per-row dictionaries (orient='records'); the
# storage loop later reads the 'Major', 'Text' and 'School' keys of each one.
majors_dict = majors_meta.to_dict(orient='records')
# Display the converted records.
majors_dict

In [None]:
# Convert schools to a list of per-row dictionaries (orient='records'); the
# storage loop later reads the 'School' and 'Text' keys of each one.
schools_dict = schools.to_dict(orient='records')
# Display the converted records.
schools_dict

# Chunking

In [None]:
# Disable tokenizer parallelism by setting the TOKENIZERS_PARALLELISM
# environment variable to "false" for this process.
# NOTE(review): presumably this silences the Hugging Face tokenizers
# fork/parallelism warnings - confirm which problem it was added for.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Fetch NLTK's "punkt" data, which nltk.tokenize.sent_tokenize (used in
# chunk_text) relies on for sentence splitting.
# NOTE(review): newer NLTK releases may require the "punkt_tab" resource
# instead - confirm against the installed version.
nltk.download("punkt")

# Load tokenizer (for OpenAI models)
# "cl100k_base" is a tiktoken encoding; chunk_text uses it only to count the
# tokens of each sentence.
tokenizer = tiktoken.get_encoding("cl100k_base")

# Function to chunk text while maintaining sentence structure
def chunk_text(text, max_tokens=500, *, sentence_splitter=None, token_counter=None):
    """Split *text* into chunks made of whole sentences.

    Sentences are packed greedily, in order, into the current chunk; a new
    chunk is started when adding the next sentence would push the chunk's
    token total above *max_tokens*. Sentences inside a chunk are joined with
    a single space.

    Args:
        text: The text to split.
        max_tokens: Token budget per chunk. A single sentence that is longer
            than the budget is never split; it becomes a chunk on its own
            and therefore exceeds the budget.
        sentence_splitter: Optional callable ``str -> list[str]``. Defaults
            to ``nltk.tokenize.sent_tokenize``.
        token_counter: Optional callable ``str -> int`` giving the token
            count of a sentence. Defaults to the length of the module-level
            ``tokenizer`` encoding of the sentence.

    Returns:
        A list of non-empty chunk strings; an empty list when the splitter
        yields no sentences.
    """
    if sentence_splitter is None:
        sentence_splitter = nltk.tokenize.sent_tokenize
    if token_counter is None:
        token_counter = lambda sentence: len(tokenizer.encode(sentence))

    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentence_splitter(text):
        sentence_tokens = token_counter(sentence)
        # Only flush when there is something to flush: without the
        # `current_chunk` check, an over-long sentence arriving while the
        # chunk is still empty would emit an empty-string chunk.
        if current_chunk and current_length + sentence_tokens > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += sentence_tokens

    # Flush whatever is left after the last sentence.
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks



# Store data in ChromaDB vectorized database

In [None]:
# Initialize ChromaDB
# PersistentClient keeps the database on disk under ./chroma_db;
# get_or_create_collection opens "Majors_Collection" if it is already there
# and creates it otherwise. Both the major chunks and the school chunks are
# stored in this one collection.
chroma_client = chromadb.PersistentClient(path="./chroma_db")  
collection = chroma_client.get_or_create_collection(name="Majors_Collection")

# Load embedding model
# The storage loops and retrieve_major_and_school_info() reach this object
# through the global name `model` and call its .encode().
model = SentenceTransformer("all-MiniLM-L6-v2")

## Do not rerun the cells below: they add the data to the database

In [None]:
# Store chunked majors data
# Each major's text is chunked, all of its chunks are embedded in one batch,
# and the batch is written with a single upsert. Upsert (rather than add)
# makes this cell safe to run again: a chunk id that already exists is
# overwritten instead of clashing with the stored entry.
# NOTE(review): ids are built from the major name, so two rows with the same
# 'Major' value would overwrite each other's chunks - confirm names are unique.
for major_record in majors_dict:
    chunks = chunk_text(major_record["Text"])
    if not chunks:
        # Nothing to store for this major; avoid sending an empty batch.
        continue

    # encode() on a list returns one vector per chunk; convert to plain lists for ChromaDB.
    embeddings = model.encode(chunks).tolist()
    collection.upsert(
        ids=[f"{major_record['Major']}_chunk{i}" for i in range(len(chunks))],
        embeddings=embeddings,
        metadatas=[
            {
                "Major": major_record["Major"],
                "chunk_index": i,
                "Text": chunk,
                "School": major_record['School'],
            }
            for i, chunk in enumerate(chunks)
        ],
    )

print("Chunked course descriptions stored in ChromaDB!")

In [None]:
# Store chunked schools data
# Each school's text is chunked, all of its chunks are embedded in one batch,
# and the batch is written with a single upsert. Upsert (rather than add)
# makes this cell safe to run again: a chunk id that already exists is
# overwritten instead of clashing with the stored entry.
# School chunks carry no "Major" key in their metadata; that is what tells
# them apart from the major chunks stored in the same collection.
for school_record in schools_dict:
    chunks = chunk_text(school_record["Text"])
    if not chunks:
        # Nothing to store for this school; avoid sending an empty batch.
        continue

    # encode() on a list returns one vector per chunk; convert to plain lists for ChromaDB.
    embeddings = model.encode(chunks).tolist()
    collection.upsert(
        ids=[f"{school_record['School']}_chunk{i}" for i in range(len(chunks))],
        embeddings=embeddings,
        metadatas=[
            {
                "School": school_record["School"],
                "chunk_index": i,
                "Text": chunk,
            }
            for i, chunk in enumerate(chunks)
        ],
    )

print("Chunked schools stored in ChromaDB!")

# Retrieval

In [None]:
# helper: nearest chunks whose metadata passes a Python-side filter
def _query_matching_chunks(query_embedding, n_wanted, keep, where=None):
    """Return up to *n_wanted* nearest chunk metadatas for which keep(metadata) is true.

    Major and school chunks live in the same collection and both carry a
    "School" key, so a plain top-n query returns a mix of the two kinds.
    This helper filters the hits in Python and, when the filter removed too
    many, asks the collection again for twice as many results until enough
    matching chunks are found or the collection has no more hits to give.

    Args:
        query_embedding: Query vector as a plain list.
        n_wanted: Number of matching chunks to return.
        keep: Predicate over a metadata dict.
        where: Optional ChromaDB metadata filter applied by the database.

    Returns:
        A list of metadata dicts, nearest first, at most *n_wanted* long.
    """
    if n_wanted <= 0:
        return []

    n_results = n_wanted
    while True:
        results = collection.query(
            query_embeddings=[query_embedding],
            where=where,
            n_results=n_results,
        )
        metadatas = results["metadatas"][0]
        matching = [metadata for metadata in metadatas if keep(metadata)]
        # Fewer hits than requested means the collection is exhausted.
        if len(matching) >= n_wanted or len(metadatas) < n_results:
            return matching[:n_wanted]
        n_results *= 2


# function for document retrieval
#user can input the number of major documents and number of school documents they want to retrieve
def retrieve_major_and_school_info(query, k_m=10, k_s = 2):
    """Retrieve major chunks for *query* plus chunks about the schools they belong to.

    Args:
        query: Free-text user query; embedded with the global `model`.
        k_m: Number of major chunks to retrieve.
        k_s: Number of school chunks to retrieve per school.

    Returns:
        A tuple ``(major_info, school_info)`` of metadata-dict lists.
        `major_info` holds chunks whose metadata has a "Major" value.
        `school_info` holds chunks without one, grouped by school in the
        order the schools first appear in `major_info`.
    """
    # Step 1: Embed the query once; the same vector is reused for every lookup
    query_embedding = model.encode(query).tolist()

    # Step 2: Retrieve major chunks (metadata has a "Major" value)
    major_info = _query_matching_chunks(
        query_embedding,
        k_m,
        keep=lambda metadata: bool(metadata.get("Major")),
    )

    # Step 3: Collect the schools of those majors. dict.fromkeys removes
    # duplicates while keeping first-seen order (a set would make the order,
    # and with it the generated context, vary between runs); majors without
    # a school are skipped because None is not a usable filter value.
    schools_to_query = list(dict.fromkeys(
        info["School"] for info in major_info if info.get("School")
    ))

    # Step 4: Retrieve school chunks for each school. Major chunks also carry
    # a "School" key, so they are excluded here; otherwise this step would
    # mostly return the major chunks already found in step 2.
    school_info = []
    for school_name in schools_to_query:
        school_info.extend(_query_matching_chunks(
            query_embedding,
            k_s,
            keep=lambda metadata: not metadata.get("Major"),
            where={"School": school_name},
        ))

    return major_info, school_info

In [None]:
# Example search for "computer science"
# Asks for 10 major chunks (k_m) and 5 school chunks per school (k_s).
query = "course schedule for computer science major"
major_info, school_info = retrieve_major_and_school_info(query, 10, 5)

In [None]:
# Display the retrieved major chunk metadata.
major_info

In [None]:
# Display the retrieved school chunk metadata.
school_info

In [None]:
#function to reformat major info into a long string
def format_major_info_string(major_info):
    """Render retrieved major chunks as one context string.

    Each metadata dict becomes "<School>: <Major>: <Text>", with newlines and
    non-breaking spaces in the text replaced by plain spaces. Missing keys
    fall back to "Unknown School", "Unknown Major" and "No Text Available".
    Entries are separated by a blank line; an empty input gives "".
    """
    def render(entry):
        school_name = entry.get('School', 'Unknown School')
        major_name = entry.get("Major", "Unknown Major")
        body = entry.get("Text", "No Text Available")
        # Flatten the text onto one line
        single_line = body.replace("\n", " ").replace("\xa0", " ")
        return f"{school_name}: {major_name}: {single_line}"

    return "\n\n".join(render(entry) for entry in major_info)

In [None]:
#function to reformat school info into a long string
def format_school_info_string(school_info):
    """Render retrieved school chunks as one context string.

    Each metadata dict becomes "<School> \\n <Text>", with newlines and
    non-breaking spaces in the text replaced by plain spaces. Missing keys
    fall back to "Unknown School" and "No Text Available". Entries are
    separated by a blank line; an empty input gives "".
    """
    # One-pass character mapping: newline and non-breaking space -> space
    to_spaces = str.maketrans({"\n": " ", "\xa0": " "})

    sections = []
    for entry in school_info:
        school_name = entry.get('School', 'Unknown School')
        body = entry.get("Text", "No Text Available")
        sections.append(f"{school_name} \n {body.translate(to_spaces)}")

    return "\n\n".join(sections)

# Generation

In [None]:
#get token from .env file

# Load environment variables from .env file
load_dotenv()

# Retrieve the token
# Reads the TOKEN environment variable; os.getenv gives None when it is not
# set. The value is passed to the Hugging Face login below.
token = os.getenv("TOKEN")


In [None]:
# Log into Hugging Face with the token read from the environment.
# NOTE(review): presumably needed to download the Mistral checkpoint below - confirm.
login(token = token)

In [None]:
#set up torch
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
# (An Apple 'mps' branch used to sit here but was commented out, so MPS is
# never selected.)
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
#load mistral model (don't run locally only run on AWS because of model size)
model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
processor = AutoProcessor.from_pretrained(model_checkpoint)
model = AutoModelForImageTextToText.from_pretrained( 
    model_checkpoint, 
    device_map = torch_device, 
    torch_dtype = torch.float32
)

In [None]:
# set system prompt
#set system prompt
SYSTEM_PROMPT = """You are a UVA undergraduate course scheduler. You use retrieved documents to create a schedule for one semester. 
Do not make up any courses, only use information from the documents. When you're not sure about some information, you say that you don't have the information and don't make up anything.
If the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request.
A full time student should have between 12-15 credits per semester."""

In [None]:
#function to generate response from LLM
def generate_response(query, k_m, k_s):
    '''retrieve relevant documents from majors and schools and generate response to user query using mistral 3 model'''

    #get documents
    major_info, school_info = retrieve_major_and_school_info(query, k_m, k_s)
    major = format_major_info_string(major_info)
    school = format_school_info_string(school_info)
    context = major + school

    #construct message
    messages = [
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {'role':'user', 'content' : f"Context:\n{context}\n\nQuestion: {query}"}
    ]

    #process input for mistral
    text = processor.apply_chat_template(messages, tokenize = False, add_generation_prompt=True)
    inputs = processor(text=text, return_tensors='pt').to(torch_device, dtype = torch.float32)

    #generate response
    generate_id = model.generate(**inputs, max_new_tokens = 200, do_sample = False)
    decoded_output = processor.batch_decode(generate_id[:,inputs['input_ids'].shape[1]:], skip_special_tokens = True)[0]

    return decoded_output

In [None]:
# Example end-to-end run: retrieve 10 major chunks and 5 school chunks per
# school for the question, then generate the reply.
user_query = "I'm a computer science major at UVA and I need a schedule for the spring semester of my third year"
response = generate_response(user_query, 10, 5)