In [1]:
import json
import pandas as pd

# Load in the JSON file: a nested dict keyed by top-level site category
# (later cells walk it as category -> key -> either page text or a {url: text} dict).
with open("/home/msaad/workspace/honors-thesis/data-collection/data/categorized_data.json", "r") as f:
    data = json.load(f)

# Load in cleaned csv; the cells below only read its 'url' column.
df = pd.read_csv("/home/msaad/workspace/honors-thesis/data-collection/data/full_cleaned_data.csv")

In [2]:
# Top-level categories present in the categorized JSON
data.keys()

dict_keys(['about', 'academics', 'admissions', 'admissions-aid', 'graduate', 'library', 'life', 'live', 'research-foundation', 'scholarships-aid', 'support'])

In [3]:
# Every page URL from the cleaned CSV, in row order.
url_list = df['url'].to_list()

# Keep only what follows the domain. partition() yields '' when a URL does not
# contain "brockport.edu", whereas split(...)[1] raised IndexError and aborted
# the whole cell on a single off-site link.
url_shortened_list = [url.partition("brockport.edu")[2] for url in url_list]

# Pull out all the portions the URL goes through (empty segments dropped)
split_urls = [[portion for portion in url.split("/") if portion] for url in url_shortened_list]

# First path segment = top-level site section; URLs with no path segment are skipped.
first_portions = [portions[0] for portions in split_urls if portions]

In [4]:
# Number of URLs under each top-level site section, most frequent first
pd.Series(first_portions).value_counts()

academics              1225
support                 485
live                    278
life                    274
about                   139
graduate                 50
admissions               48
admissions-aid           29
library                  20
research-foundation      17
scholarships-aid         12
Name: count, dtype: int64

# Try to build the search engine

In [28]:
import os

def save_to_txt(
    data: str,
    filename: str,
    category: str,
    subcategory: str = None,
    base_file_path: str = "/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/raw_data",
):
    """Write one page's text to ``<base_file_path>/<category>[/<subcategory>]/<name>.txt``.

    Args:
        data: Text to write.
        filename: Page URL or plain name. The ``https://www2.brockport.edu/``
            prefix is removed and every remaining ``/`` becomes ``-`` so the
            result is a single flat file name.
        category: Top-level folder name.
        subcategory: Optional second-level folder name; omitted when ``None``.
        base_file_path: Root of the output tree. Defaults to the notebook's
            raw_data folder; override it to write somewhere else.

    The target folder is created if missing and an existing file of the same
    name is overwritten. The folder path is printed on every call.
    """
    filename = filename.removeprefix("https://www2.brockport.edu/").replace("/", "-")

    # Identity comparison is the correct test for None.
    subdir = f"/{subcategory}" if subcategory is not None else ""
    path = f"{base_file_path}/{category}{subdir}/"

    print(path)

    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(path, exist_ok=True)

    with open(f"{path}{filename}.txt", "w") as f:
        f.write(data)

In [29]:
# Walk the two-level tree in `data` and write every page to disk.
for category, content in data.items():
    for subcategory, inner_content in content.items():
        if isinstance(inner_content, dict):
            # Second level: the key is a subcategory whose dict maps url -> text.
            for url, text in inner_content.items():
                save_to_txt(text, url, category, subcategory)
        elif isinstance(inner_content, str):
            # Leaf directly under the category: the key itself names the file
            # (presumably the page URL) and the value is its text.
            save_to_txt(inner_content, subcategory, category)

/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/raw_data/about/
/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/raw_data/about/
/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/raw_data/about/
/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/raw_data/about/
/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/raw_data/about/
/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/raw_data/about/
/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/raw_data/about/
/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/raw_data/about/
/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/raw_data/about/
/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/raw_data/about/
/home/msaad/workspace/honors-thesis/data-collectio

# Make an empty set of folders for Chroma

In [39]:
import os

def save_to_txt(
    data: str,
    filename: str,
    category: str,
    subcategory: str = None,
    base_file_path: str = "/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/chroma_data",
):
    """Create ``<base_file_path>/<category>[/<subcategory>]/`` and return that path.

    NOTE: this redefinition replaces the earlier ``save_to_txt`` in the kernel
    and, despite the name, writes no file. ``data`` and ``filename`` are
    accepted only so existing call sites keep working; they are not used.

    Args:
        data: Ignored.
        filename: Ignored.
        category: Top-level folder name.
        subcategory: Optional second-level folder name; omitted when ``None``.
        base_file_path: Root of the folder tree. Defaults to the notebook's
            chroma_data folder.

    Returns:
        The folder path, with a trailing ``/``.
    """
    subdir = f"/{subcategory}" if subcategory is not None else ""
    path = f"{base_file_path}/{category}{subdir}/"

    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(path, exist_ok=True)

    return path

In [40]:
# Walk the same two-level tree, collecting each distinct folder path that
# save_to_txt returns (the set removes the per-page duplicates).
set_of_paths = set()
for category, content in data.items():
    for subcategory, inner_content in content.items():
        if isinstance(inner_content, dict):
            # Second level: one call per url -> text entry of the subcategory.
            for url, text in inner_content.items():
                set_of_paths.add(save_to_txt(text, url, category, subcategory))
        elif isinstance(inner_content, str):
            # Leaf directly under the category.
            set_of_paths.add(save_to_txt(inner_content, subcategory, category))

In [41]:
# Distinct chroma_data folders (one per category / subcategory); used later
# as the list of vector stores to build.
set_of_paths

{'/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/chroma_data/about/',
 '/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/chroma_data/about/diversity/',
 '/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/chroma_data/about/middlestates/',
 '/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/chroma_data/about/president/',
 '/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/chroma_data/about/strategic-plan/',
 '/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/chroma_data/about/title-ix/',
 '/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/chroma_data/academics/',
 '/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/chroma_data/academics/academic-affairs/',
 '/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/chroma_data/academics

# Let's see how we can load these using LangChain now

In [7]:
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader


# Trial run on a single category folder.
path = "/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/raw_data/about/"

# glob="*.txt" — presumably matches only the files directly inside `path`,
# not those in its subfolders (TODO confirm DirectoryLoader's default).
loader = DirectoryLoader(path, glob="*.txt", loader_cls=TextLoader)

docs = loader.load()

# Number of documents loaded
len(docs)

29

In [8]:
# Immediate subdirectories of `path` (uses `os`, imported in an earlier cell)
[d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]

['president', 'diversity', 'title-ix', 'strategic-plan', 'middlestates']

In [16]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re

# Embedding model used for building and querying the Chroma stores below.
# NOTE(review): the device is hard-coded to 'cuda'; this will presumably fail on a machine without a GPU.
model = HuggingFaceBgeEmbeddings(
    model_name = "BAAI/bge-small-en",
    model_kwargs = {'device': 'cuda'},
    encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
)

In [11]:
# Standardizing the content for each chunk
def standardize_string(input_string):
    """Normalise the text of one chunk.

    Newlines become spaces, every run of whitespace collapses to a single
    space, any leading run of characters other than ASCII letters or digits
    is removed, and surrounding whitespace is stripped.
    """
    # Newlines -> spaces, then squeeze all whitespace runs to one space.
    single_spaced = re.sub(r'\s+', ' ', input_string.replace('\n', ' '))

    # Drop leading bullets, punctuation and other non-alphanumeric characters.
    return re.sub(r'^[^a-zA-Z0-9]+', '', single_spaced).strip()

In [12]:
# Chunk the data: target chunk size 300 with an overlap of 50 between neighbouring chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
texts = text_splitter.split_documents(docs)

In [17]:
# Create new 'texts' with some additional filters
texts_cleaned = []

# Normalise each chunk in place and keep only those longer than 100 characters
# after cleaning. Iterating over the chunks directly avoids using `id` as a
# loop index, which rebound the builtin id() for the rest of the notebook.
for chunk in texts:
    chunk.page_content = standardize_string(chunk.page_content)

    if len(chunk.page_content) > 100:
        texts_cleaned.append(chunk)

In [36]:
# Same folder as `path`, but under the parallel chroma_data tree instead of raw_data
vector_store_dir = path.replace("raw_data", "chroma_data")
vector_store_dir

'/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/chroma_data/about/'

In [38]:
# Embed the cleaned chunks with `model` and build a Chroma store rooted at vector_store_dir
vectordb = Chroma.from_documents(
    documents = texts_cleaned,
    embedding = model,
    persist_directory = vector_store_dir
)

# Write the store to its persist directory
vectordb.persist()

In [42]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re

# Embedding model read by universal_make_vector_store below as a global.
# (Same configuration as the `model` created in an earlier cell.)
# NOTE(review): the device is hard-coded to 'cuda'; this will presumably fail on a machine without a GPU.
model = HuggingFaceBgeEmbeddings(
    model_name = "BAAI/bge-small-en",
    model_kwargs = {'device': 'cuda'},
    encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
)

# Standardizing the content for each chunk
def standardize_string(input_string):
    """Return ``input_string`` cleaned up for indexing.

    Newlines are turned into spaces, whitespace runs are squeezed to one
    space, leading characters that are not ASCII letters or digits are cut
    off, and the result is stripped of surrounding whitespace.
    """
    flattened = input_string.replace('\n', ' ')
    squeezed = re.sub(r'\s+', ' ', flattened)
    without_leading_junk = re.sub(r'^[^a-zA-Z0-9]+', '', squeezed)
    return without_leading_junk.strip()


def universal_make_vector_store(path: str, vector_store_dir: str):
    """Build and persist a Chroma store from the ``*.txt`` files in ``path``.

    The files are loaded with ``TextLoader``, split into chunks
    (chunk_size=300, chunk_overlap=50), cleaned with ``standardize_string``,
    and chunks of 100 characters or fewer after cleaning are dropped. The
    remaining chunks are embedded with the notebook-level ``model`` and
    stored under ``vector_store_dir``.

    Args:
        path: Folder containing the raw ``.txt`` files.
        vector_store_dir: Persist directory for the resulting Chroma store.

    Returns:
        None.
    """
    documents = DirectoryLoader(path, glob="*.txt", loader_cls=TextLoader).load()

    splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)

    # Clean every chunk in place, keeping only the ones with enough text left.
    texts_cleaned = []
    for chunk in splitter.split_documents(documents):
        chunk.page_content = standardize_string(chunk.page_content)
        if len(chunk.page_content) > 100:
            texts_cleaned.append(chunk)

    Chroma.from_documents(
        documents = texts_cleaned,
        embedding = model,
        persist_directory = vector_store_dir
    ).persist()

In [49]:
# Apply universal make vector store to all paths: each chroma_data folder is
# paired with the raw_data folder at the same relative location.
# The loop variable is deliberately not called `path`: that name holds the
# "about" folder set in an earlier cell, and reusing it here overwrote it, so
# re-running the cells that read `path` silently used a different folder.
for chroma_dir in set_of_paths:
    universal_make_vector_store(chroma_dir.replace("/chroma_data/", "/raw_data/"), chroma_dir)

# Let's test it!

Now I have a vector DB for every folder (one per category or subcategory). Let's test it on a few to see if things are working...

In [84]:
# Pick the store to test by name. Indexing list(set_of_paths)[33] is not
# reproducible: a set of strings has no stable order across kernel sessions,
# so the same index can point at a different folder on the next run.
test_path = next(p for p in set_of_paths if p.endswith("/academics/computing-sciences/"))
test_path

'/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/chroma_data/academics/computing-sciences/'

In [85]:
# Configuration for semantic search
# NOTE(review): "score_threshold" is passed without a search_type on as_retriever;
# confirm the threshold is actually applied with the default search type.
search_args = {
    "score_threshold": .8,
    "k": 3
}

# Initialize retriever from the persisted Chroma store at test_path, embedding
# queries with `model` (the HuggingFace BGE embeddings defined above, not OpenAI)
retriever = Chroma(
    persist_directory = test_path, 
    embedding_function = model
).as_retriever(search_kwargs = search_args)

# Can we build the search engine?

Yes, by implementing GPT-4 categorization. Really though, this part is about building up the code that actually sends a query to the right location.

In [101]:
# Reminder of the top-level categories the classifier has to choose between
data.keys()

dict_keys(['about', 'academics', 'admissions', 'admissions-aid', 'graduate', 'library', 'life', 'live', 'research-foundation', 'scholarships-aid', 'support'])

In [102]:
import os
import openai

# Read the key from the environment so it never appears in the notebook.
# os.getenv returns None when OPENAI_API_KEY is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [147]:
# The user question that is routed through the category tree below
question = "How can I apply?"

# First-stage prompt: asks for exactly one top-level category name, or "none".
# The listed names match the keys of `data`, plus the extra "none" option.
# NOTE(review): the "none" entry refers to "the above categories" although it is listed first.
initial_categorization_prompt = f"""\
The question is: 
{question}

The following categories available are:
"none": if the question does not fit into any of the above categories, or are not related to SUNY Brockport
"live": for policy related questions
"academics": academic related information to majors, or programs.
"support": current student and faculty support
"life": information about student life
"about": information about the university, such as Title IX, mission statement, diversity, or strategic plan, local area, president, etc.
"admissions": for prospective students looking to apply
"graduate": information about graduate programs
"admissions-aid": information about admissions and financial aid
"scholarships-aid": information about scholarships and financial aid
"library": information about the library
"research-foundation": information about the research at Brockport

Respond ONLY with the name of the category. (i.e. "live", "academics", etc.). If a question does not fit into any of the above categories, or is otherwise inappropriate, respond with "none".
"""

def categorize_question(
    prompt: str,
    system_prompt: str = "You are a helpful classification system. Categorize a question into its category based on the brief description provided.",
    max_tokens: int = 10,
):
    """Send ``prompt`` to GPT-4 and return the text of the first reply.

    With the defaults this behaves as a classifier: the system message asks
    for a category and the reply is capped at 10 tokens. Those two settings
    were previously hard-coded, so calls that expected a free-text answer got
    a category label or a truncated reply; pass ``system_prompt`` and
    ``max_tokens`` to use the function for answer generation.

    Args:
        prompt: Content of the user message.
        system_prompt: Content of the system message.
        max_tokens: Upper bound on the length of the reply, in tokens.

    Returns:
        The ``content`` string of the first choice in the response.
    """
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        temperature=0,  # same prompt should map to the same label
        max_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    return response.to_dict()['choices'][0]['message']['content']

In [120]:
# Stage 1: ask GPT-4 for the top-level category of the question
first_category = categorize_question(initial_categorization_prompt)
first_category

'admissions'

In [119]:
# Keys under the chosen category.
# NOTE(review): the prompt lets the model reply "none", which is not a key of `data` and would raise KeyError here.
subcategory_keys = data[first_category].keys()

# Create list of non-urls: keys that do not start with "http" are treated as subcategory names
dirs = [non_url for non_url in subcategory_keys if not non_url.startswith("http")]
dirs

['apply', 'information']

In [123]:
# Folder of the vector store for the chosen top-level category
base_path = "/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/chroma_data/"
path_to_first_category = f"{base_path}{first_category}/"

# Configuration for semantic search
# NOTE(review): "score_threshold" is passed without a search_type on as_retriever;
# confirm the threshold is actually applied with the default search type.
search_args = {
    "score_threshold": .8,
    "k": 3
}

# Initialize retriever from the persisted Chroma store of that category, embedding
# queries with `model` (the HuggingFace BGE embeddings defined above, not OpenAI)
retriever = Chroma(
    persist_directory = path_to_first_category, 
    embedding_function = model
).as_retriever(search_kwargs = search_args)

In [140]:
# Retrieve the chunks matching the question and join their text into one context string
vector_search_results = retriever.get_relevant_documents(question)
joined_search_results = " ".join([doc.page_content for doc in vector_search_results])
joined_search_results

'SINCE I DON’T NEED TO SUBMIT TEST SCORES, WHAT DO I NEED TO SUBMIT TO APPLY TO BROCKPORT? We will need three things from you: - Apply through the Common App or SUNY App - Official High School transcript - At least one letter of recommendation Apply First apply to Brockport. Then: land your dream career. An education from SUNY Brockport doesn’t just carry you through the classroom — it will carry you through the entirety of your career. Apply as a: Visit Our Admissions Advisors will review your application holistically. We will look at everything in your application - including your high school courses, involvement (such as community service, clubs, etc.), letter(s) of recommendation, and college essay.'

In [143]:
# Now generate a prompt for the subcategory, giving GPT the option to search more, or take its current information

# One subcategory name per line
dir_list = "\n".join(dirs)

# Second-stage prompt: the model may either answer from the retrieved text or name a subcategory.
# NOTE(review): the example names in the last line ("live", "academics") are top-level
# categories, not entries of dir_list.
subcategory_prompt = f"""\
The question is:
{question}

If the answer is available in the following data, answer the question. If not, choose one of the following subcategories.
The decision is yours whether to search more, or take your current information.

The current information is:
{joined_search_results}

The following subcategories available are:
{dir_list}

If you choose a subcategory, respond ONLY with the name of that category. (i.e. "live", "academics", etc.)."""

In [144]:
# Show the fully rendered second-stage prompt
print(subcategory_prompt)

The question is:
How can I apply?

If the answer is available in the following data, answer the question. If not, choose one of the following subcategories.
The decision is yours whether to search more, or take your current information.

The current information is:
SINCE I DON’T NEED TO SUBMIT TEST SCORES, WHAT DO I NEED TO SUBMIT TO APPLY TO BROCKPORT? We will need three things from you: - Apply through the Common App or SUNY App - Official High School transcript - At least one letter of recommendation Apply First apply to Brockport. Then: land your dream career. An education from SUNY Brockport doesn’t just carry you through the classroom — it will carry you through the entirety of your career. Apply as a: Visit Our Admissions Advisors will review your application holistically. We will look at everything in your application - including your high school courses, involvement (such as community service, clubs, etc.), letter(s) of recommendation, and college essay.

The following subcate

In [149]:
# Stage 2: the reply is either a subcategory name or a free-text answer.
# NOTE(review): categorize_question caps replies at max_tokens=10 and uses a
# classification system prompt, so a free-text answer would be cut short.
final_category_or_answer = categorize_question(subcategory_prompt)

In [151]:
# Heuristic: a reply containing a space is treated as a free-text answer,
# anything else as a subcategory name.
# NOTE(review): `second_category` is only assigned in the else-branch, so the
# next cell fails with NameError on a fresh kernel when the answer branch is taken.
if ' ' in final_category_or_answer:
    answer = final_category_or_answer

else:
    second_category = final_category_or_answer
    # Bare lookup: raises KeyError if the reply is not a key under this category; the value is discarded.
    data[first_category][second_category]


In [154]:
# Folder of the vector store for the chosen subcategory
# (reads `second_category`, which the previous cell sets only in its else-branch)
base_path = "/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/chroma_data/"
path_to_final_category = f"{base_path}{first_category}/{second_category}"
path_to_final_category

'/home/msaad/workspace/honors-thesis/data-collection/data/categorized_datastore/chroma_data/admissions/apply'

In [155]:
# Configuration for semantic search
# NOTE(review): "score_threshold" is passed without a search_type on as_retriever;
# confirm the threshold is actually applied with the default search type.
search_args = {
    "score_threshold": .8,
    "k": 3
}

# Initialize retriever from the persisted Chroma store of the subcategory, embedding
# queries with `model` (the HuggingFace BGE embeddings defined above, not OpenAI)
retriever = Chroma(
    persist_directory = path_to_final_category, 
    embedding_function = model
).as_retriever(search_kwargs = search_args)

In [156]:
# Repeat the retrieval against the subcategory store and rebuild the context string
vector_search_results = retriever.get_relevant_documents(question)
joined_search_results = " ".join([doc.page_content for doc in vector_search_results])
joined_search_results

'How to Apply If you would like to apply to the University, please speak with an Undergraduate Admissions advisor prior to applying. - Submit an application and the $50 application fee. Submit official college transcripts from any institutions you have attended. How to Apply If you would like to be readmitted to the University, please speak with an Undergraduate Admissions advisor prior to applying. Submit a letter from a parent or school counselor stating that they are supportive of your interest in taking courses at the collegiate level. - Please bring the letter with you to the Office of Undergraduate Admissions upon applying. Non-Matriculated High School Student Application'

In [159]:
# Final-stage prompt: answer only from the retrieved text, otherwise refuse
final_possible_prompt = f"""\
The question is:
{question}

If the answer is available in the following data, answer the question. If not, refuse to answer the question.

The current information is:
{joined_search_results}"""

In [160]:
# Show the fully rendered final prompt
print(final_possible_prompt)

The question is:
How can I apply?

If the answer is available in the following data, answer the question. If not, refuse to answer the question.

The current information is:
How to Apply If you would like to apply to the University, please speak with an Undergraduate Admissions advisor prior to applying. - Submit an application and the $50 application fee. Submit official college transcripts from any institutions you have attended. How to Apply If you would like to be readmitted to the University, please speak with an Undergraduate Admissions advisor prior to applying. Submit a letter from a parent or school counselor stating that they are supportive of your interest in taking courses at the collegiate level. - Please bring the letter with you to the Office of Undergraduate Admissions upon applying. Non-Matriculated High School Student Application


In [161]:
# NOTE(review): categorize_question sends a classification system prompt and
# max_tokens=10, so the reply shown below is a category-style label rather than
# an answer to the question. Answer generation needs its own system prompt and
# a larger token budget.
answer = categorize_question(final_possible_prompt)
answer

'The category for this question is: Education/University'

# Price of running

In [3]:
import openai
import os

# Read the key from the environment; os.getenv returns None when it is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")

# One sample GPT-4 call, capped at 100 completion tokens, whose usage numbers
# are read in the cells below.
response = openai.ChatCompletion.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "Your a helpful assistant"},
        {"role": "user", "content": "Tell me about shakespeare"}
    ],
    temperature=0.8,
    max_tokens=100
)

In [8]:
# Tokens counted for the request messages
response['usage']['prompt_tokens']

20

In [9]:
# Tokens generated in the reply (equals the max_tokens=100 cap for this call)
response['usage']['completion_tokens']

100

In [12]:
# Cost of the call above: 0.03 per 1,000 prompt tokens plus 0.06 per 1,000 completion tokens.
# NOTE(review): presumably GPT-4 list prices in USD at the time of writing; confirm before reuse.
response['usage']['prompt_tokens'] * 0.03/1000 + response['usage']['completion_tokens'] * 0.06/1000

0.0066