# Import required packages

In [130]:
#load packages
import pandas as pd
import os
import json
import chromadb
from sentence_transformers import SentenceTransformer
import torch
from huggingface_hub import login
from dotenv import load_dotenv
import torchvision

# Load in Documents

In [136]:
# Read in csv for majors
majors_meta = pd.read_csv("Scraped_data/majors_meta_cleaned.csv")
majors_meta.head()

Unnamed: 0,Major,Text,School
0,"Architectural History, B.Ar.H.","Architectural History, B.Ar.H. Print-Friendly ...",School of Architecture
1,"Architecture, B.S.","Architecture, B.S. Print-Friendly Page (opens ...",School of Architecture
2,"Urban and Environmental Planning, B.U.E.P.","Urban and Environmental Planning, B.U.E.P. Pri...",School of Architecture
3,"African-American and African Studies, B.A.","African American and African Studies, B.A. Pri...",College of Arts and Sciences
4,"Anthropology, B.A.","Anthropology, B.A. Print-Friendly Page (opens ...",College of Arts and Sciences


In [148]:
import glob
csv_files = glob.glob("./Lous_List/*.csv") # Get all files within folder
courses = pd.concat(
    (pd.read_csv(f).assign(filename=os.path.basename(f)[:-4]) for f in csv_files),
    ignore_index=True
)

courses.head()

Unnamed: 0,ClassNumber,Mnemonic,Number,Section,Type,Units,Instructor,Days,Room,Title,Topic,Status,Enrollment,EnrollmentLimit,Waitlist,filename
0,10003,AAS,1010,100,Lecture,4,"Robert Vinson, Naseemah Mohamed",TuTh 12:30pm - 1:45pm,Minor Hall 125,Introduction to African-American and African S...,,Open,0,180,0,Fall_2025
1,12774,AAS,1010,101,Discussion,0,To Be Announced,We 6:00pm - 6:50pm,New Cabell Hall 283,Introduction to African-American and African S...,,Open,0,20,0,Fall_2025
2,10006,AAS,1010,102,Discussion,0,To Be Announced,We 5:00pm - 5:50pm,New Cabell Hall 287,Introduction to African-American and African S...,,Open,0,20,0,Fall_2025
3,10004,AAS,1010,103,Discussion,0,To Be Announced,Tu 7:00pm - 7:50pm,New Cabell Hall 411,Introduction to African-American and African S...,,Open,0,20,0,Fall_2025
4,10005,AAS,1010,104,Discussion,0,To Be Announced,Tu 8:00pm - 8:50pm,New Cabell Hall 411,Introduction to African-American and African S...,,Open,0,20,0,Fall_2025


# Store data in ChromaDB vectorized database

### Creating and Testing Embedding

In [17]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

embed_model = HuggingFaceEmbedding(model_name = "all-MiniLM-L6-v2") # Load local embedding model
Settings.embed_model = embed_model
print("Embedding a test sentence:", Settings.embed_model.get_text_embedding("test sentence")[:5])

Embedding a test sentence: [0.042972829192876816, 0.0966348648071289, -0.002129161963239312, 0.07826831191778183, -0.006417457479983568]


### Creating ChromaDB and Saving Documents

#### 1. Majors Index

In [149]:
from llama_index.core.schema import Document
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore

# Convert each row to a LlamaIndex document
documents = [
    Document(
        text = row["Text"],
        metadata = {"Major": row["Major"], "School": row["School"]}
    )
    for _, row in majors_meta.iterrows()
]

# 1. Initialize Chroma vector store for majors (persistent DB stored in ./chroma_db folder)
chroma_store = ChromaVectorStore.from_params(
    collection_name = "majors_collection",
    persist_dir = "./chroma_db"  # local directory for vector store
)

# 2. Create the majors index â€” this will auto-chunk + auto-embed
index = VectorStoreIndex.from_documents(documents, vector_store = chroma_store)
index.storage_context.persist() # 3. Persist to disk

#### 2. Courses Index

In [151]:
# Convert each row to a LlamaIndex document
course_docs = []
for _, row in courses.iterrows():
    course_text = f"""Course ID: {row['Mnemonic']+str(row['Number'])}
        Title: {row['Title']}
        Credits: {row['Units']},
        Instructor: {row['Instructor']},
        Schedule: {row['Days']},
        Period: {row['filename']}"""
    course_docs.append(Document(text=course_text, metadata=row.to_dict()))
    
# 1. Initialize Chroma vector store for courses
courses_chroma_store = ChromaVectorStore.from_params(
    collection_name = "courses_collection", 
    persist_dir = "./chroma_db")

# 2. Create the courses index
courses_index = VectorStoreIndex.from_documents(course_docs, vector_store = courses_chroma_store)
courses_index.storage_context.persist() # 3. Persist to disk

# Retrieval

In [164]:
from llama_index.core import StorageContext, load_index_from_storage

# 1. Reload the Index from Storage
#storage_context = StorageContext.from_defaults(persist_dir = "./chroma_db")
#index = load_index_from_storage(storage_context)

# 2. Set Up a Query Engine (Retriever + LLM)
retriever = index.as_retriever(similarity_top_k = 2, embed_model = embed_model)

query = "core courses for computer science major"
nodes = retriever.retrieve(query)

for i, node in enumerate(nodes, 1):
    print(f"\nðŸ“š Result {i}:")
    print(node.metadata)
    print(node.text[:500])


ðŸ“š Result 1:
{'Major': 'Computer Science, B.S.', 'School': 'School of Engineering and Applied Science'}
Some students with programming experience may wish to take the place-out test, which satisfies the requirement but does not award degree credit.Â  Please contact the department for more information about place-out tests. Choose one of the following: CS 1110Â -Â Introduction to Programming Credits:          3 CS 1111Â -Â Introduction to Programming Credits:          3 CS 1112Â -Â Introduction to Programming Credits:          3 CS 1113Â -Â Introduction to Programming Credits:          3 Foundation Courses 

ðŸ“š Result 2:
{'Major': 'Computer Science, B.A.', 'School': 'College of Arts and Sciences'}
Computer Science, B.A. Print-Friendly Page (opens a new window) Return to: College of Arts & Sciences: Degree Programs The BA degree program in Computer ScienceÂ provides students with a solid foundation in computer science theory and practice. An important goal of the program is give stu

In [165]:
# Querying the courses index
nodes = courses_index.as_retriever(similarity_top_k=3).retrieve("Data Science Spring 2025")

for i, node in enumerate(nodes, 1):
    print(f"\nðŸ“š Result {i}:")
    print(node.metadata)
    print(node.text[:500])


ðŸ“š Result 1:
{'ClassNumber': 12626, 'Mnemonic': 'STAT', 'Number': 1601, 'Section': '003', 'Type': 'Lecture', 'Units': '3', 'Instructor': 'Kristen Roland', 'Days': 'TuTh 2:00pm - 3:15pm', 'Room': 'Chemistry Bldg 217', 'Title': 'Introduction to Data Science with R', 'Topic': nan, 'Status': 'Closed', 'Enrollment': 92, 'EnrollmentLimit': 90, 'Waitlist': 0, 'filename': 'Spring_2025'}
Course ID: STAT1601
        Title: Introduction to Data Science with R
        Credits: 3,
        Instructor: Kristen Roland,
        Schedule: TuTh 2:00pm - 3:15pm,
        Period: Spring_2025

ðŸ“š Result 2:
{'ClassNumber': 21143, 'Mnemonic': 'DS', 'Number': 5111, 'Section': '001', 'Type': 'Lecture', 'Units': '3', 'Instructor': 'Jonathan Kropko', 'Days': 'Tu 8:30pm - 9:30pm', 'Room': 'Web-Based Course', 'Title': 'Streamlining Data Science with Software and Automation Skills', 'Topic': nan, 'Status': 'Open', 'Enrollment': 33, 'EnrollmentLimit': 60, 'Waitlist': 0, 'filename': 'Spring_2025'}
Course ID: DS511

# Retrieve relevant information from both indices

In [176]:
# Retrieve major requirements
major_retriever = index.as_retriever(similarity_top_k=1)
major_info = major_retriever.retrieve("I am a data science major")[0].get_content()

# Retrieve available courses (you can filter this by Spring 2025)
course_retriever = courses_index.as_retriever(similarity_top_k=20)
available_courses = course_retriever.retrieve("Spring 2025 data science courses")
available_course_texts = "\n\n".join([node.get_content() for node in available_courses])

# Generation

In [186]:
load_dotenv() # Load environment variables from .env file
token = os.getenv("HUGGINFACE_TOKEN") # Retrieve the token

# Log into huggingface
login(token = token)

# 1. Load model
llm = HuggingFaceLLM(model_name = "mistralai/Mistral-7B-Instruct-v0.1",
                     tokenizer_name = "mistralai/Mistral-7B-Instruct-v0.1",
                     device_map = "auto",       # uses GPU if available, else CPU
                     max_new_tokens = 512,  # or even 1024
                     model_kwargs = {
                         "offload_folder": "./offload",  # If needed for disk-based offloading
                         "torch_dtype": torch.float16,
                         "trust_remote_code": True
                     },
                     generate_kwargs = {
                         "temperature": 0.1
                     },
                     system_prompt = SYSTEM_PROMPT)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [187]:
# Set system prompt
SYSTEM_PROMPT = """You are a helpful academic advisor at UVA. Given the student's major requirements and available courses, generate a schedule for Spring 2025. 
- Include only courses relevant to their major
- Avoid time conflicts
- Aim for 12 to 15 credits
- Each bullet should include: Course ID, Title, Days & Time, and Instructor.
- Include course title, schedule, and professor.
- Only use courses from the list provided below.

Format:

* <Course ID>: <Title> | <Schedule> | <Instructor>"""

# Combine system prompt with information retrieved from previous queries
full_prompt = f"""{SYSTEM_PROMPT}

Major Requirements:
{major_info}

Available Courses:
{available_course_texts}

Generate the schedule below:
"""

response = llm.complete(full_prompt)
print(response.text)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



* <Course ID>: <Title> | <Schedule> | <Instructor>

Spring 2025 Schedule:

* DS5111: Streamlining Data Science with Software and Automation Skills | Tu 8:30pm - 9:30pm | Jonathan Kropko
* DS3025: Mathematics for Data Science | TuTh 11:00am - 12:15pm | Aaron Abrams
* DS1002: Programming for Data Science | TuTh 8:00am - 9:15am | Mai Dahshan
* STAT1601: Introduction to Data Science with R | TuTh 2:00pm - 3:15pm | Kristen Roland

Total Credits: 12

Note: The schedule is subject to change based on instructor availability and course availability. It is recommended to check the SIS for the most up-to-date schedule.


### Function to generate repsonse

In [193]:
def generate_response(query, k_s = 20):
    # Retrieve context on the major
    major_retriever = index.as_retriever(similarity_top_k=1)
    major_info = major_retriever.retrieve(query)[0].get_content()
    
    # Retrieve available courses
    course_retriever = courses_index.as_retriever(similarity_top_k=20)
    available_courses = course_retriever.retrieve(query)
    available_course_texts = "\n\n".join([node.get_content() for node in available_courses])

    # Construct full prompt with retrieved information
    full_prompt = f"""{SYSTEM_PROMPT}

    Major Requirements:
    {major_info}

    Available Courses:
    {available_course_texts}

    Generate the schedule below:
    """
    
    response = llm.complete(full_prompt) # prompt response
    return response.text

query = "I am a data scientist major, need schedule for Spring 2025"
print(generate_response(query))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



    <Course ID>: <Title> | <Schedule> | <Instructor>

    DS5111: Streamlining Data Science with Software and Automation Skills | Tu 8:30pm - 9:30pm | Jonathan Kropko

    DS3025: Mathematics for Data Science | TuTh 11:00am - 12:15pm | Aaron Abrams

    DS1002: Programming for Data Science | TuTh 8:00am - 9:15am | Mai Dahshan

    COMM3220: Data Management for Decision Making | TuTh 9:30am - 10:45am | David Schuff

    DS2002: Data Science Systems | MoWe 3:30pm - 4:45pm | Neal Magee

    DS6001: Data Engineering I: Data Pipeline Architecture | We 7:15pm - 8:15pm | Jonathan Kropko

    DS1002: Programming for Data Science | TuTh 12:30pm - 1:45pm | Mai Dahshan

    COMM3220: Data Management for Decision Making | MoWe 12:30pm - 1:45pm | Suprateek Sarker

    PLAN6040: Quantitative Methods of Planning Analysis | TuTh 9:30am - 10:45am | Andrew Mondschein

    DS2002: Data Science Systems | MoWe 3:30pm - 4:45pm | Jason Williamson

    IT3600: Data Analytics & Decision-Making | Mo 1:24am - 1