# Import required packages

In [None]:
#!pip install llama-cpp-python

In [1]:
#load packages
import pandas as pd
from functools import partial
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import json
import tiktoken
import nltk
import chromadb
from sentence_transformers import SentenceTransformer
from transformers import AutoProcessor, AutoTokenizer, AutoModelForImageTextToText, AutoModelForCausalLM, AutoModel 
import torch
from huggingface_hub import login
from dotenv import load_dotenv
import torchvision
from llama_cpp import Llama
import requests

# Load in Documents

In this section we will load in the scraped documents and clean them.

In [2]:
#read in csv for majors
majors = pd.read_csv("Scraped_data/uva_majors.csv")

#remove non major data
majors = majors.iloc[8:-1]


In [3]:
# remove the extra text from the text column
test = 'Info For Students Alumni Military Affiliated Students Faculty & Staff Search Search Submit Search Close search Info For Students Alumni Military Affiliated Students Faculty & Staff Calendars Academic Calendars Exam Schedules Student Records Diplomas Transcripts FERPA Programs Calendar Schools University Registrar Carruthers Hall, 1001 N. Emmet St. P.O. Box 400203 Charlottesville, VA 22904-4203 Staff Directory Contact Info Phone: (434) 924-4122 Fax: (434) 924-4156 Email: [email protected] Hours Of Operation M-F: 10am - noon and 1pm - 4pm Â© 2024 By the Rector and Visitors of the University of Virginia Legal Privacy Report a Barrier Share Your Feedback University of Virginia Mar 31, 2025 Undergraduate Record 2024-2025 Select a Catalog Undergraduate Record 2024-2025 Graduate Record 2024-2025 Global Search Catalog Search Choose Search Location Select an option Courses Programs Schools/Colleges & Departments Policies and Other Non-Academic Content Entire Catalog Search Keyword Field Whole Word/Phrase Advanced Search Catalog Navigation Catalog Home Academic Calendar Admission Schools Programs, Degrees & Course Info Student Resources ROTC University Regulations About UVA Archived Records HELP Undergraduate Record 2024-2025'
text_to_remove_head = 'Info For Students Alumni Military Affiliated Students Faculty & Staff Search Search Submit Search Close search Info For Students Alumni Military Affiliated Students Faculty & Staff Calendars Academic Calendars Exam Schedules Student Records Diplomas Transcripts FERPA Programs Calendar Schools University Registrar Carruthers Hall, 1001 N. Emmet St. P.O. Box 400203 Charlottesville, VA 22904-4203 Staff Directory Contact Info Phone: (434) 924-4122 Fax: (434) 924-4156 Email: [email protected] Hours Of Operation M-F: 10am - noon and 1pm - 4pm Â© 2024 By the Rector and Visitors of the University of Virginia Legal Privacy Report a Barrier Share Your Feedback University of Virginia Mar 12, 2025 Undergraduate Record 2024-2025 Select a Catalog Undergraduate Record 2024-2025 Graduate Record 2024-2025 Global Search Catalog Search Choose Search Location Select an option Courses Programs Schools/Colleges & Departments Policies and Other Non-Academic Content Entire Catalog Search Keyword Field Whole Word/Phrase Advanced Search Catalog Navigation Catalog Home Academic Calendar Admission Schools Programs, Degrees & Course Info Student Resources ROTC University Regulations About UVA Archived Records HELP Undergraduate Record 2024-2025' 
text_to_remove_tail = 'Back to Top | Print-Friendly Page (opens a new window) All catalogs © 2025 University of Virginia. Powered by Modern Campus Catalog™ . .'
#function to remove the text
def remove_text(text):
    if text.startswith(text_to_remove_head):
        text = text[len(text_to_remove_head):]
    if text.startswith(test):
        text = text[len(test):]
    if text.endswith(text_to_remove_tail):
        text = text[:-len(text_to_remove_tail)]
    return text.strip()

#apply the function to the text column
majors['Text'] = majors['Text'].apply(remove_text)

#since some of the majors have extra text that is a bit different, we need to remove it
majors[majors['Major'] == 'Psychology, B.A.']['Text']

#majors[50:90]

64    Psychology, B.A. Print-Friendly Page (opens a ...
Name: Text, dtype: object

In [4]:
# remove extra text from the majors column where majors includes "interdisciplinary"
text_to_remove = 'Interdisciplinary - '
text_to_remove2 = 'Interdisciplinary Major - '
def remove_text_majors(text):
    if text.startswith(text_to_remove):
        text = text[len(text_to_remove):]
    if text.startswith(text_to_remove2):
        text = text[len(text_to_remove2):]
    return text.strip()

#apply the function to the majors column
majors['Major'] = majors['Major'].apply(remove_text_majors)

In [5]:
majors[20:40]

Unnamed: 0,Major,Text
28,"English, B.A.","English, B.A. Print-Friendly Page (opens a new..."
29,"Environmental Sciences, B.A.","Environmental Sciences, B.A. Print-Friendly Pa..."
30,"Environmental Sciences, B.S.","Environmental Sciences, B.S. Print-Friendly Pa..."
31,"Foreign Affairs, B.A.","Foreign Affairs, B.A. Print-Friendly Page (ope..."
32,"French, B.A.","French, B.A. Print-Friendly Page (opens a new ..."
33,"German, B.A.","German, B.A. Print-Friendly Page (opens a new ..."
34,"Government, B.A.","Government, B.A. Print-Friendly Page (opens a ..."
35,"History, B.A.","History, B.A. Print-Friendly Page (opens a new..."
36,American Studies,Interdisciplinary Major - American Studies Pri...
37,Archaeology,Interdisciplinary Major - Archaeology Print-Fr...


In [6]:
#read in schools data
schools = pd.read_csv("Scraped_data/uva_schools_academic_rules.csv")
schools

Unnamed: 0,School,Text
0,School of Architecture,Info For Students Alumni Military Affiliated S...
1,College of Arts & Sciences,Info For Students Alumni Military Affiliated S...
2,McIntire School of Commerce,Info For Students Alumni Military Affiliated S...
3,School of Continuing and Professional Studies,Info For Students Alumni Military Affiliated S...
4,School of Data Science,Info For Students Alumni Military Affiliated S...
5,School of Education and Human Development,Info For Students Alumni Military Affiliated S...
6,School of Engineering and Applied Science,Info For Students Alumni Military Affiliated S...
7,Frank Batten School of Leadership and Public P...,Info For Students Alumni Military Affiliated S...
8,School of Nursing,Info For Students Alumni Military Affiliated S...


In [7]:
# remove the extra text from the text column
extra_begining_text = 'Info For Students Alumni Military Affiliated Students Faculty & Staff Search Search Submit Search Close search Info For Students Alumni Military Affiliated Students Faculty & Staff Calendars Academic Calendars Exam Schedules Student Records Diplomas Transcripts FERPA Programs Calendar Schools University Registrar Carruthers Hall, 1001 N. Emmet St. P.O. Box 400203 Charlottesville, VA 22904-4203 Staff Directory Contact Info Phone: (434) 924-4122 Fax: (434) 924-4156 Email: [email protected] Hours Of Operation M-F: 10am - noon and 1pm - 4pm Â© 2024 By the Rector and Visitors of the University of Virginia Legal Privacy Report a Barrier Share Your Feedback University of Virginia Mar 31, 2025 Undergraduate Record 2024-2025 Select a Catalog Undergraduate Record 2024-2025 Graduate Record 2024-2025 Global Search Catalog Search Choose Search Location Select an option Courses Programs Schools/Colleges & Departments Policies and Other Non-Academic Content Entire Catalog Search Keyword Field Whole Word/Phrase Advanced Search Catalog Navigation Catalog Home Academic Calendar Admission Schools Programs, Degrees & Course Info Student Resources ROTC University Regulations About UVA Archived Records HELP Undergraduate Record 2024-2025'

def remove_text_requirements(text):
    if text.startswith(extra_begining_text):
        text = text[len(extra_begining_text):]
    return text.strip()

#run the function on the text column
schools['Text'] = schools['Text'].apply(remove_text_requirements)

In [8]:
schools

Unnamed: 0,School,Text
0,School of Architecture,"School of Architecture: Academic Rules, Regula..."
1,College of Arts & Sciences,"College of Arts & Sciences: Academic Rules, Re..."
2,McIntire School of Commerce,"McIntire School of Commerce: Academic Rules, R..."
3,School of Continuing and Professional Studies,School of Continuing and Professional Studies:...
4,School of Data Science,School of Data Science: Academic Rules Print-F...
5,School of Education and Human Development,School of Education and Human Development: Aca...
6,School of Engineering and Applied Science,School of Engineering and Applied Science: Aca...
7,Frank Batten School of Leadership and Public P...,Frank Batten School of Leadership and Public P...
8,School of Nursing,"School of Nursing: Academic Rules, Regulations..."


In [9]:
# read in class data
classes = pd.read_csv("Lous_List_Database/Fall_25_UVA_w_descr.csv")
classes.head()

Unnamed: 0,ClassNumber,Mnemonic,Number,Section,Type,Units,Instructor,Days,Room,Title,Topic,Status,Enrollment,EnrollmentLimit,Waitlist,Description
0,10003,AAS,1010,100,Lecture,4,"Robert Vinson, Naseemah Mohamed",TuTh 12:30pm - 1:45pm,Minor Hall 125,Introduction to African-American and African S...,,Open,0,180,0,This introductory course surveys the histories...
1,12774,AAS,1010,101,Discussion,0,To Be Announced,We 6:00pm - 6:50pm,New Cabell Hall 283,Introduction to African-American and African S...,,Open,0,20,0,This introductory course surveys the histories...
2,10006,AAS,1010,102,Discussion,0,To Be Announced,We 5:00pm - 5:50pm,New Cabell Hall 287,Introduction to African-American and African S...,,Open,0,20,0,This introductory course surveys the histories...
3,10004,AAS,1010,103,Discussion,0,To Be Announced,Tu 7:00pm - 7:50pm,New Cabell Hall 411,Introduction to African-American and African S...,,Open,0,20,0,This introductory course surveys the histories...
4,10005,AAS,1010,104,Discussion,0,To Be Announced,Tu 8:00pm - 8:50pm,New Cabell Hall 411,Introduction to African-American and African S...,,Open,0,20,0,This introductory course surveys the histories...


# Add school metadata to majors and convert to dictionary

In [10]:
# copy of the majors data frame
majors_meta = majors
majors_meta['School'] = None
majors_meta = majors_meta.reset_index(drop=True)

#add school to each majors in major data frame
majors_meta['School'][0:3] = 'School of Architecture'
majors_meta['School'][3:62] = 'College of Arts and Sciences'
majors_meta['School'][62:63] = 'McIntire School of Commerce'
majors_meta['School'][63:65] = 'School of Continuing and Professional Studies'
majors_meta['School'][65:66] = 'School of Data Science'
majors_meta['School'][66:72] = 'School of Education and Human Development'
majors_meta['School'][72:83] = 'School of Engineering and Applied Science'
majors_meta['School'][83:84] = 'Frank Batten School of Leadership and Public Policy'
majors_meta['School'][84:85] = 'School of Nursing'

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  majors_meta['School'][0:3] = 'School of Architecture'
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the origina

In [11]:
majors_meta.head(10)

Unnamed: 0,Major,Text,School
0,"Architectural History, B.Ar.H.","Architectural History, B.Ar.H. Print-Friendly ...",School of Architecture
1,"Architecture, B.S.","Architecture, B.S. Print-Friendly Page (opens ...",School of Architecture
2,"Urban and Environmental Planning, B.U.E.P.","Urban and Environmental Planning, B.U.E.P. Pri...",School of Architecture
3,"African-American and African Studies, B.A.","African American and African Studies, B.A. Pri...",College of Arts and Sciences
4,"Anthropology, B.A.","Anthropology, B.A. Print-Friendly Page (opens ...",College of Arts and Sciences
5,"Applied Statistics, B.A.","Applied Statistics, B.A. Print-Friendly Page (...",College of Arts and Sciences
6,"Area Studies, B.A.","Area Studies, B.A. Print-Friendly Page (opens ...",College of Arts and Sciences
7,Art History,History of Art Print-Friendly Page (opens a ne...,College of Arts and Sciences
8,Studio Art,Studio Art Print-Friendly Page (opens a new wi...,College of Arts and Sciences
9,"Astronomy, B.A.","Astronomy, B.A. Print-Friendly Page (opens a n...",College of Arts and Sciences


In [12]:
#convert majors to dictionary
majors_dict = majors_meta.to_dict(orient='records')
majors_dict[0:2]

[{'Major': 'Architectural History, B.Ar.H.',
  'Text': 'Architectural History, B.Ar.H. Print-Friendly Page (opens a new window) Return to: School of Architecture: Degree Programs Universal Curriculum Requirements To be awarded a degree from the School of Architecture, students are required to complete universal curriculum requirements in addition to the program requirements provided below. The school universal curriculum requirements can be found on the school Degree Programs page . Program Requirements Undergraduate students entering the School of Architecture share a Common First Year in the School of Architecture . Students take core courses in Architectural History, Architecture, and Urban & Environmental Planning to provide a framework for the study of\xa0the built environment\xa0through observation, analysis, and design. Students must pass each core course with a grade of C- or higher. During the spring semester of the first year, students choose an intended major: Bachelor of Ar

In [13]:
#convert schools to dictionary
schools_dict = schools.to_dict(orient='records')
schools_dict[0:2]

[{'School': 'School of Architecture',
 {'School': 'College of Arts & Sciences',

In [14]:
# dictionary for school to classes matching with Mnemonic
#query sis api
api_url = 'https://sisuva.admin.virginia.edu/psc/ihprd/UVSS/SA/s/WEBLIB_HCX_CM.H_CLASS_SEARCH.FieldFormula.IScript_ClassSearchOptions?institution=UVA01&term=1248'

r = requests.get(api_url)

sis = json.loads(r.text)
sis_df = pd.DataFrame(sis['subjects'])


In [15]:
# remove subject letters from descr column
sis_df['descr'] = sis_df.apply(lambda row: row['descr'].replace(row['subject'], ''), axis=1)
# remove extra spaces from descr column
sis_df['descr'] = sis_df['descr'].str.replace(r'- ', '', regex=True).str.replace(r'\s+', ' ', regex=True).str.strip()
sis_df.head()

Unnamed: 0,subject,descr,acad_groups,acad_orgs,careers,campuses
0,AAS,African-American and African Studies,[CGAS],[AAS],"[GRAD, UGRD]",[MAIN]
1,ACCT,Accounting,[SCPS],[SCPSD],"[GRAD, UGRD]",[SCPS]
2,AIRS,Air Science,[PROV],[AIRS],[UGRD],[MAIN]
3,ALAR,Architecture and Landscape Architecture,[ARCH],[ALAR],[GRAD],[MAIN]
4,AMST,American Studies,[CGAS],[AMST],"[GRAD, UGRD]",[MAIN]


In [16]:
# Perform a left merge
merged_classes = classes.merge(sis_df[['subject', 'descr']], how='left', left_on='Mnemonic', right_on='subject')

# Drop the redundant 'subject' column from the merged DataFrame
merged_classes = merged_classes.drop(columns=['subject'])

# add column for semester = Fall and column for year = spring
merged_classes['semester'] = 'Fall'
merged_classes['year'] = '2025'

#replace nan description with 'none'
merged_classes['Description'] = merged_classes['Description'].fillna('none')


# Display the first few rows of the merged DataFrame
merged_classes.head()

Unnamed: 0,ClassNumber,Mnemonic,Number,Section,Type,Units,Instructor,Days,Room,Title,Topic,Status,Enrollment,EnrollmentLimit,Waitlist,Description,descr,semester,year
0,10003,AAS,1010,100,Lecture,4,"Robert Vinson, Naseemah Mohamed",TuTh 12:30pm - 1:45pm,Minor Hall 125,Introduction to African-American and African S...,,Open,0,180,0,This introductory course surveys the histories...,African-American and African Studies,Fall,2025
1,12774,AAS,1010,101,Discussion,0,To Be Announced,We 6:00pm - 6:50pm,New Cabell Hall 283,Introduction to African-American and African S...,,Open,0,20,0,This introductory course surveys the histories...,African-American and African Studies,Fall,2025
2,10006,AAS,1010,102,Discussion,0,To Be Announced,We 5:00pm - 5:50pm,New Cabell Hall 287,Introduction to African-American and African S...,,Open,0,20,0,This introductory course surveys the histories...,African-American and African Studies,Fall,2025
3,10004,AAS,1010,103,Discussion,0,To Be Announced,Tu 7:00pm - 7:50pm,New Cabell Hall 411,Introduction to African-American and African S...,,Open,0,20,0,This introductory course surveys the histories...,African-American and African Studies,Fall,2025
4,10005,AAS,1010,104,Discussion,0,To Be Announced,Tu 8:00pm - 8:50pm,New Cabell Hall 411,Introduction to African-American and African S...,,Open,0,20,0,This introductory course surveys the histories...,African-American and African Studies,Fall,2025


In [17]:
#convdert classes to dictionary
classes_dict = merged_classes.to_dict(orient='records')
classes_dict[0:2]

[{'ClassNumber': 10003,
  'Mnemonic': 'AAS',
  'Number': '1010',
  'Section': '100',
  'Type': 'Lecture',
  'Units': '4',
  'Instructor': 'Robert Vinson, Naseemah Mohamed',
  'Days': 'TuTh 12:30pm - 1:45pm',
  'Room': 'Minor Hall 125',
  'Title': 'Introduction to African-American and African Studies I',
  'Topic': nan,
  'Status': 'Open',
  'Enrollment': 0,
  'EnrollmentLimit': 180,
  'Waitlist': 0,
  'Description': 'This introductory course surveys the histories of people of African descent in Africa, the Americas, and the Caribbean from approximately the Middle Ages to the 1880s. Emphases include the Atlantic slave trade and its complex relationship to Africa; the economic systems, cultures, and communities of Africans and African-Americans in the New World, in slavery and in freedom; the rise of anti-slavery movements; and the socio-economic systems that replaced slavery in the late 19th century.',
  'descr': 'African-American and African Studies',
  'semester': 'Fall',
  'year': '2

# Chunking

In [18]:
# disable tokenizer parallelism so we stop getting problems
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [19]:
nltk.download("punkt")

# Load tokenizer (for OpenAI models)
tokenizer = tiktoken.get_encoding("cl100k_base")

# Function to chunk text while maintaining sentence structure
def chunk_text(text, max_tokens=500):
    sentences = nltk.tokenize.sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_tokens = len(tokenizer.encode(sentence))
        if current_length + sentence_tokens > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += sentence_tokens

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks



[nltk_data] Downloading package punkt to /Users/rfell/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Store data in ChromaDB vectorized database

In [20]:
# Initialize ChromaDB
chroma_client = chromadb.PersistentClient(path="./chroma_db")  
collection = chroma_client.get_or_create_collection(name="Majors_Collection")
#create separate collection for classes
collection_classes = chroma_client.get_or_create_collection(name="Classes_Collection")

# Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

## Don't rerun adding info to database

In [21]:
# Store chunked majors data
for course in majors_dict:
    chunks = chunk_text(course["Text"])
    for i, chunk in enumerate(chunks):
        embedding = model.encode(chunk).tolist()  # Convert to list for ChromaDB
        collection.add(
            ids=[f"{course['Major']}_chunk{i}"],  
            embeddings=[embedding],  
            metadatas=[{
                "Major": course["Major"],
                "chunk_index": i,
                "Text": chunk,
                "School": course['School']
            }]
        )

print("Chunked course descriptions stored in ChromaDB!")

Insert of existing embedding ID: Architectural History, B.Ar.H._chunk0
Add of existing embedding ID: Architectural History, B.Ar.H._chunk0
Insert of existing embedding ID: Architectural History, B.Ar.H._chunk1
Add of existing embedding ID: Architectural History, B.Ar.H._chunk1
Insert of existing embedding ID: Architectural History, B.Ar.H._chunk2
Add of existing embedding ID: Architectural History, B.Ar.H._chunk2
Insert of existing embedding ID: Architectural History, B.Ar.H._chunk3
Add of existing embedding ID: Architectural History, B.Ar.H._chunk3
Insert of existing embedding ID: Architecture, B.S._chunk0
Add of existing embedding ID: Architecture, B.S._chunk0
Insert of existing embedding ID: Architecture, B.S._chunk1
Add of existing embedding ID: Architecture, B.S._chunk1
Insert of existing embedding ID: Architecture, B.S._chunk2
Add of existing embedding ID: Architecture, B.S._chunk2
Insert of existing embedding ID: Urban and Environmental Planning, B.U.E.P._chunk0
Add of existing 

Chunked course descriptions stored in ChromaDB!


In [22]:
# Store chunked schools data
for course in schools_dict:
    chunks = chunk_text(course["Text"])
    for i, chunk in enumerate(chunks):
        embedding = model.encode(chunk).tolist()  # Convert to list for ChromaDB
        collection.add(
            ids=[f"{course['School']}_chunk{i}"],  
            embeddings=[embedding],  
            metadatas=[{
                "School": course["School"],
                "chunk_index": i,
                "Text": chunk
            }]
        )

print("Chunked schools stored in ChromaDB!")

Insert of existing embedding ID: School of Architecture_chunk0
Add of existing embedding ID: School of Architecture_chunk0
Insert of existing embedding ID: School of Architecture_chunk1
Add of existing embedding ID: School of Architecture_chunk1
Insert of existing embedding ID: School of Architecture_chunk2
Add of existing embedding ID: School of Architecture_chunk2
Insert of existing embedding ID: School of Architecture_chunk3
Add of existing embedding ID: School of Architecture_chunk3
Insert of existing embedding ID: School of Architecture_chunk4
Add of existing embedding ID: School of Architecture_chunk4
Insert of existing embedding ID: School of Architecture_chunk5
Add of existing embedding ID: School of Architecture_chunk5
Insert of existing embedding ID: School of Architecture_chunk6
Add of existing embedding ID: School of Architecture_chunk6
Insert of existing embedding ID: School of Architecture_chunk7
Add of existing embedding ID: School of Architecture_chunk7
Insert of existi

Chunked schools stored in ChromaDB!


add course number, section, semester, year, etc
this might improve


In [23]:


#store classes in collection
for course in classes_dict:
    embedding = model.encode(course["Description"]).tolist()  # Convert to list for ChromaDB
    collection_classes.add(
        ids=[f"{course['ClassNumber']}_{course['Mnemonic']}_{course['Number']}_{course['Section']}"],  
        embeddings=[embedding],  
        metadatas=[{
            "Code": course["Mnemonic"],
            "Number": course["Number"],
            "Section": course["Section"],
            "Type": course["Type"],
            "Units": course["Units"],
            "Instructor": course["Instructor"],
            "Days": course["Days"],
            "Room": course["Room"],
            "Title": course["Title"],
            'Semester': course['semester'],
            'Year': course['year'], 
            "Type": 'Class',
            "Description": course["Description"],
            "Department": course["descr"]
        }]
    )

print("Classes data stored in ChromaDB!")


Add of existing embedding ID: 10003_AAS_1010_100
Insert of existing embedding ID: 10003_AAS_1010_100
Add of existing embedding ID: 12774_AAS_1010_101
Insert of existing embedding ID: 12774_AAS_1010_101
Add of existing embedding ID: 10006_AAS_1010_102
Insert of existing embedding ID: 10006_AAS_1010_102
Add of existing embedding ID: 10004_AAS_1010_103
Insert of existing embedding ID: 10004_AAS_1010_103
Add of existing embedding ID: 10005_AAS_1010_104
Insert of existing embedding ID: 10005_AAS_1010_104
Add of existing embedding ID: 10007_AAS_1010_105
Insert of existing embedding ID: 10007_AAS_1010_105
Add of existing embedding ID: 10008_AAS_1010_106
Insert of existing embedding ID: 10008_AAS_1010_106
Add of existing embedding ID: 11700_AAS_1010_107
Insert of existing embedding ID: 11700_AAS_1010_107
Add of existing embedding ID: 11701_AAS_1010_108
Insert of existing embedding ID: 11701_AAS_1010_108
Add of existing embedding ID: 11702_AAS_1010_109
Insert of existing embedding ID: 11702_AAS

KeyboardInterrupt: 

# Retrieval

In [24]:
# function for document retrieval
#user can input the number of major documents and number of school documents they want to retrieve
def retrieve_major_and_school_info(query, k_m=10, k_s = 2):
    # Step 1: Retrieve major-related chunks
    query_embedding = model.encode(query).tolist()

    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=k_m
    )

    # Step 2: Extract major-specific results and school information
    major_info = []
    for i in range(len(results["ids"][0])):
        metadata = results["metadatas"][0][i]
        # If the metadata has 'major_name', this is a major document
        if metadata.get("Major"):
            major_info.append(metadata)

    # Step 3: Extract the school name from the major info
    schools_to_query = set([info["School"] for info in major_info])

    # Step 4: Retrieve relevant school descriptions based on the inferred school name
    school_info = []
    for school_name in schools_to_query:
        school_results = collection.query(
            query_embeddings=[query_embedding],
            where={"School": school_name},
            n_results=k_s
        )
        school_info.extend(school_results["metadatas"][0])

    return major_info, school_info

In [51]:
# Example search for "computer science"
query = "course schedule for computer science major"
major_info, school_info = retrieve_major_and_school_info(query, 10, 5)

In [53]:
major_info

[{'Major': 'Computer Science, B.S.',
  'School': 'School of Engineering and Applied Science',
  'Text': 'The 2000 level courses should be taken before the 3000 level courses and note that there are other prerequisites that govern the order that these courses should be taken. Example schedules can be found on the Computer Science Department webpage . CS 2100\xa0-\xa0Data Structures and Algorithms 1 Credits: 4 CS 2120\xa0-\xa0Discrete Mathematics and Theory 1 Credits: 3 CS 2130\xa0-\xa0Computer Systems and Organization 1 Credits: 4 CS 3100\xa0-\xa0Data Structures and Algorithms 2 Credits:          3 CS 3120\xa0-\xa0Discrete Mathematics and Theory 2 Credits: 3 CS 3130\xa0-\xa0Computer Systems and Organization 2 Credits: 4 CS 3140\xa0-\xa0Software Development Essentials Credits: 3 Upper-Level Required Courses BSCS majors must take one upper-level course in software engineering and one course to complete the SEAS senior thesis. CS 3240\xa0-\xa0Software Engineering Credits:          3 The se

In [50]:
school_info[0:2]

[{'Major': 'Data Science, B.S.',
  'School': 'School of Data Science',
  'Text': 'DS 1001\xa0-\xa0Foundation of Data Science Credits:          3 Programming Requirement There are multiple ways this requirement can be met. Enrollment in one of the following courses at UVA: DS 1002\xa0-\xa0Programming for Data Science Credits:          3 CS 1110\xa0-\xa0Introduction to Programming Credits:          3 CS 1111\xa0-\xa0Introduction to Programming Credits:          3 CS 1112\xa0-\xa0Introduction to Programming Credits:          3 CS 1113\xa0-\xa0Introduction to Programming Credits:          3 PHYS 1655\xa0-\xa0Introduction to Python for Scientists and Engineers Credits: 3 OR Test credit AP Computer Science A with a score if 4 or 5 IB HL Computer Science with a score of 5, 6, or 7 OR Passing the CS 1110 Place-Out Test Students who pass the test\xa0receive a notation in SIS. The School of Data Science encourages students to complete the place-out test in Python. Students must be proficient in 

# Functions to transform output of retrieval to long string

In [28]:
#function to reformat major info into a long string
def format_major_info_string(major_info):
    formatted_texts = []
    for doc in major_info:
        school = doc.get('School', 'Unknown School') #get school or unknown school
        major = doc.get("Major", "Unknown Major")  # Get Major, or "Unknown Major" if missing
        text = doc.get("Text", "No Text Available")  # Get Text, or "No Text Available" if missing

        # Clean up newline characters and non-breaking spaces
        cleaned_text = text.replace("\n", " ").replace("\xa0", " ")

        formatted_texts.append(f"{school}: {major}: {cleaned_text}")

    context = "\n\n".join(formatted_texts)  # Join with double newlines for separation
    return context

In [29]:
#function to reformat school info into a long string
def format_school_info_string(school_info):
    formatted_texts = []
    for doc in school_info:
        school = doc.get('School', 'Unknown School') #get school or unknown school
        text = doc.get("Text", "No Text Available")  # Get Text, or "No Text Available" if missing

        # Clean up newline characters and non-breaking spaces
        cleaned_text = text.replace("\n", " ").replace("\xa0", " ")
        #cleaned_text = " ".join(text.split()) 

        formatted_texts.append(f"{school} \n {cleaned_text}")

    context = "\n\n".join(formatted_texts)  # Join with double newlines for separation
    return context

In [30]:
def format_class_info_string(class_info):
    formatted_texts = []
    for doc in class_info:
        code = doc.get("Code", "Unknown Code")  # Get Code, or "Unknown Code" if missing
        number = doc.get("Number", "Unknown Number")  # Get Number, or "Unknown Number" if missing
        section = doc.get("Section", "Unknown Section")  # Get Section, or "Unknown Section" if missing
        type = doc.get("Type", "Unknown Type")  # Get Type, or "Unknown Type" if missing
        units = doc.get("Units", "Unknown Units")  # Get Units, or "Unknown Units" if missing
        instructor = doc.get("Instructor", "Unknown Instructor")  # Get Instructor, or "Unknown Instructor" if missing
        days = doc.get("Days", "Unknown Days")  # Get Days, or "Unknown Days" if missing
        room = doc.get("Room", "Unknown Room")  # Get Room, or "Unknown Room" if missing
        title = doc.get("Title", "No Title Available")  # Get Title, or "No Title Available" if missing
        description = doc.get("Description", "No Description Available")  # Get Description, or "No Description Available" if missing
        semester = doc.get("Semester", "Unknown Semester")
        year = doc.get("Year", "Unknown Year")
        department = doc.get("Department", "Unknown Department")

        # Clean up newline characters and non-breaking spaces
        cleaned_description = description.replace("\n", " ").replace("\xa0", " ")

        formatted_texts.append(f"code:{code}, number: {number}, semester: {semester}, year:{year},section: {section}, units: {units}, type: {type} instructor: {instructor}, days: {days}, room: {room},department {department},  title: {title} course desrcription: {cleaned_description}")

    context = "\n\n".join(formatted_texts)  # Join with double newlines for separation
    return context

# Generation

In [31]:
#get token from .env file

# Load environment variables from .env file
load_dotenv()

# Retrieve the token
token = os.getenv("TOKEN")


In [32]:
# log into huggingface
login(token = token)

In [33]:
#set system prompt
SYSTEM_PROMPT_1 = """You are a University of Virginia undergraduate course scheduler. 
The output of this prompt will be used to search for classes, by looking for embedding similarities, in a ChromaDB vector database so make sure the output is clear, concise and under 500 tokens.
The classes database includes information on course times, descriptions, professors, and other relevant information.
As background information: You use retrieved documents to create a schedule for one semester. A full time student should have between 12-15 credits per semester. 
Do not make up any courses, only use information from the context.
Do not include any non relevant information like courses you aren't taking, or any other information that is not relevant to the course schedule.
The user will ask you to create a schedule for a specific major or school and year. Do not give a third or fourth year student all introductory classes and do not give a first year student all high level classes. The user may also provide possible additional information such as professors, days of the week, or specific courses.
"""

## Using Llamma cpp package

In [34]:
#load 6-bit quantized mistral model
llm = Llama.from_pretrained(
	repo_id="bartowski/mistralai_Mistral-Small-3.1-24B-Instruct-2503-GGUF",
	filename="mistralai_Mistral-Small-3.1-24B-Instruct-2503-IQ3_XXS.gguf",
    n_ctx = 20000, #token size, might be excessive (about 75-100 pages of text)
    n_gpu_layers=-1,  # Use Metal GPU acceleration
    n_threads=8,
    verbose=True 
)

llama_model_load_from_file_impl: using device Metal (Apple M3 Max) - 26578 MiB free
llama_model_loader: loaded meta data with 45 key-value pairs and 363 tensors from /Users/rfell/.cache/huggingface/hub/models--bartowski--mistralai_Mistral-Small-3.1-24B-Instruct-2503-GGUF/snapshots/6f1e0225e5ab39e4904d2ff2cc4e7805b416eff3/./mistralai_Mistral-Small-3.1-24B-Instruct-2503-IQ3_XXS.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Mistral Small 3.1 24B Instruct 2503
llama_model_loader: - kv   3:                            general.version str              = 2503
llama_model_loader: - kv   4:                           general.finetu

 original model used:
 repo_id="bartowski/mistralai_Mistral-Small-3.1-24B-Instruct-2503-GGUF",
	filename="mistralai_Mistral-Small-3.1-24B-Instruct-2503-IQ2_M.gguf",

This is the 24 billion parameter mistral model, but i changed it out for the 7 billion parameter model. we could also use a more agressive quantization for the 24 b model if it is much more accurate, ex: IQ3_XXS instead of IQ2_M 

repo_id="MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF",
	filename="Mistral-7B-Instruct-v0.3.IQ1_M.gguf


In [35]:
#function to generate response from LLM using llama-cpp-python library
def generate_response(major_info, school_info, system_promot, query, k_m, k_s):
    '''retrieve relevant documents from majors and schools and generate response to user query using mistral 3 model'''

    #get documents
    major_info, school_info = retrieve_major_and_school_info(query, k_m, k_s)
    major = format_major_info_string(major_info)
    school = format_school_info_string(school_info)
    context = major + school

    #construct prompt
    prompt = f"{system_promot}\n\nContext:\n{context}\n\nQuestion: {query}"

    #generate response using llama-cpp-python
    response = llm(
        prompt=prompt,
        max_tokens=500,  # Adjust max tokens as needed
        temperature=0.3,  # Adjust temperature as needed
        top_p=0.5       # Adjust top_p as needed
        #stop=["\n"]      # Define stop tokens if necessary
    )

    return response["choices"][0]["text"].strip()

## Using transformers library

# 2nd Step of RAG

here I will call previous functions that retrieve major and school info, generate a response and use that tho retrieve the classes and then generate a response again.

In [None]:
SYSTEM_PROMPT_2 = """
You are a University of Virginia undergraduate course scheduler.
Use the given context to create a course schedule based on the user query.
The schedule should include corse names, course numbers, required prerequisites, course times, instructors, room, units, and any other relevant information.
Make sure  the output is clear, concise and easy to read in a table format and make sure none of the times slots overlap.
If a class has more than one time, choose one that does not overlap with other classes.
A full time student should have between 12-15 credits per semester. No classes should be repeated on one schedule. When it comes to courses like Special Topics, Independent Study, or Research, please highlight that this is not a standard course.
A third or fourth year student should not be given all introductory classes and a first year or second year student should not have all high level classes.
The schedule should include optional courses, online/hybrid options, or specific time preferences.
The user may also ask for specific professors, days of the week, or specific courses.
Please use a professional tone and avoid any unnecessary information.

"""

In [45]:
def two_step_retrieve_and_generate_response(system_prompt1, system_prompt2, query, k_m=10, k_s=5, k_c=20):
    # Step 1: Retrieve major and school related documents
    major_info, school_info = retrieve_major_and_school_info(query, k_m, k_s)
    
    #Step 2: generate response
    response = generate_response(major_info, school_info, system_prompt1, query, k_m, k_s)


    # Step 3: Retrieve class-related documents using the initial response
    response_embedding = model.encode(response).tolist()
    class_info = []
    class_results = collection_classes.query(
        query_embeddings=[response_embedding],
        where={"Type": "Class"},  # Filter for class-related documents
        n_results=k_c
    )
    class_info = class_results["metadatas"][0]


    # Step 4: Feed class documents to the LLM along with previous
    class_context = format_class_info_string(class_info)
    final_prompt = f"{system_prompt2}\n\nMajor Context:\n{major_info}\n\nSchool Context :\n{school_info}\n\nClass Context:\n{class_context}\n\nUser Query: {query}\n\n. Generate a response."
    final_response = llm(
        prompt=final_prompt,
        max_tokens=2000,  # Adjust max tokens as needed
        temperature=0.3,  # Adjust temperature as needed
        top_p=0.5       # Adjust top_p as needed
        #stop=["\n"]      # Define stop tokens if necessary
    )

    return final_response

In [56]:
schedule = two_step_retrieve_and_generate_response(SYSTEM_PROMPT_1, SYSTEM_PROMPT_2, "second year biology major fall 2025", 10, 5, 20)

Llama.generate: 1 prefix-match hit, remaining 6081 prompt tokens to eval


KeyboardInterrupt: 

In [55]:
print(schedule["choices"][0]["text"].strip())

The capital of Russia is Moscow.
