In [1]:
import pandas as pd
import pickle
import requests
from bs4 import BeautifulSoup
import trafilatura
import re
import random
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import VectorDBQA
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA

df = pd.read_csv("./data/chunks_from_html.csv")

In [2]:
data_folder = "/home/msaad/workspace/honors-thesis/data-collection/data/"
responses_dict = pickle.load(open(data_folder + "scraper_output.p", "rb"))

def get_text(item: tuple[str, requests.models.Response]) -> list:
    """
    Parses HTML for 'sentences', as described above.
    """
    key, response = item

    # Your BeautifulSoup object
    soup = BeautifulSoup(response.text, 'html.parser')

    # Create an empty list to hold the sentences
    long_texts = []

    # Define custom punctuation
    custom_punctuation = ',./?!:;$#&+*()"'

    # Loop through all the strings in the BeautifulSoup object
    for string in soup.stripped_strings:
        # Consider a long text as a text having 15 or more words
        if len(string.split()) >= 10: # NOTE July 17. Next run edit - 10 
            # Check if the string contains only alphanumeric, whitespace and custom punctuation characters
            if all(c.isalnum() or c.isspace() or c in custom_punctuation for c in string):
                # Check if the string contains more than 3 whitespaces in a row
                if not re.search(' {3,}', string):
                    long_texts.append(string)

    # NEW: Added so semantic search queries return bigger, more in context results.
    # "chunk" is a string of all the long sentences (as described above) separated by a space.
    chunk = "\n".join(long_texts)
    
    # OLD: return long_texts
    return chunk #key, 




def new_cleaning(response: requests.models.Response) -> str:
    return trafilatura.extract(response.text, include_tables=False)

In [3]:
urls = random.sample(list(responses_dict.keys()), 5)
urls

['https://www2.brockport.edu/about/town_gown/communications_outreach.html',
 'https://www2.brockport.edu/life/residential_life/housing_selection/spring',
 'https://www2.brockport.edu/academics/kinesiology/athletic-training/opportunities',
 'https://www2.brockport.edu/live/profiles/3381-eugeniya-iskrenova-ekiert',
 'https://www2.brockport.edu/support/institutional-review-board/proposal-guidance']

In [4]:
output = []
for url in urls:
    response = responses_dict[url]
    output.append([get_text((url, response)), new_cleaning(response)])

In [36]:
len(responses_dict.keys())

5413

In [45]:
[
word in 'test_admissions' for word in ]

[True, False]

In [81]:
'brockport.edu/life/' in 'https://www2.brockport.edu/life/welcome-week'

True

In [179]:
# Filter out to only successful requests (status code == 200)
res_dict = {}
for url, response in responses_dict.items():
    if response.status_code == 200:
        res_dict[url] = response

urls = res_dict.keys()

## Many of the URLs have .html on he end of it. This causes some duplication where there is a url with a .html and another without.
# This step filters them out, and standardizes it. If both exist, it'll remove the .html version, if only the .html version exists, it'll strip it and add it to the list.
html_urls = set()
non_html_urls = set()

for url in urls:
    if url.endswith('.html'):
        stripped_url = url[:-5]  # strip .html
        html_urls.add(stripped_url)
    else:
        non_html_urls.add(url)

# Filter out .html URLs if non-html version exists
urls = set(non_html_urls | (html_urls - non_html_urls))

# Standardize the _ and - to -. Many copies of URLs with differences.
urls = {url.replace('_', '-') for url in urls}

# Filter length of URL?
# urls = {url for url in urls if url.count('/') < 6}

# Get rid of "live". Just wayyyyy too many of them, mostly professor websites.
bad_list = ['brockport.edu/live/', 'brockport.edu/life/', 'archive', 'transfer-credit', 'research-foundation']
urls = {url for url in urls if all(word not in url for word in bad_list)}


# Important decision!! Going to limit URL length for all non-admissions/faid webpages. Need to filter this down more... A lot more... 
good_list = ['brockport.edu/admissions/', 'brockport.edu/academics/advisement/handbook']
new_url = set()
for url in urls:
    if any(word in url for word in good_list):
        new_url.add(url)
    elif url.count('/') < 5:
        new_url.add(url)
urls = new_url
        


len(urls)

300

In [180]:
sorted(urls)

['https://www2.brockport.edu',
 'https://www2.brockport.edu/about',
 'https://www2.brockport.edu/about/accountability',
 'https://www2.brockport.edu/about/brockport-downtown',
 'https://www2.brockport.edu/about/contact-us',
 'https://www2.brockport.edu/about/diversity',
 'https://www2.brockport.edu/about/economic-impact',
 'https://www2.brockport.edu/about/facts',
 'https://www2.brockport.edu/about/location',
 'https://www2.brockport.edu/about/lodging',
 'https://www2.brockport.edu/about/middlestates',
 'https://www2.brockport.edu/about/president',
 'https://www2.brockport.edu/about/privacy',
 'https://www2.brockport.edu/about/strategic-plan',
 'https://www2.brockport.edu/about/sustainability',
 'https://www2.brockport.edu/about/title-ix',
 'https://www2.brockport.edu/about/town-gown',
 'https://www2.brockport.edu/academics',
 'https://www2.brockport.edu/academics/academic-affairs',
 'https://www2.brockport.edu/academics/advisement',
 'https://www2.brockport.edu/academics/advisement/ha

In [174]:
for item in list(responses_dict.values()):
    if item.status_code != 200:
        print(item.status_code)

404
404
403
404
404
404
404
404
404
404
404
404
404
404
404
403
404
404
404
404
404
404
404
403
404
404
403
404
404
404
404
404
404
404
404
404
404
404
404
404
403
403
403
404
404
404
404
404
404
404
404
404
404
404
404
404
404
404
404
404
404
404
404
404
404
404
404
404
403
404


In [93]:
urls = responses_dict.keys()

## Many of the URLs have .html on he end of it. This causes some duplication where there is a url with a .html and another without.
# This step filters them out, and standardizes it. If both exist, it'll remove the .html version, if only the .html version exists, it'll strip it and add it to the list.
html_urls = set()
non_html_urls = set()

for url in urls:
    if url.endswith('.html'):
        stripped_url = url[:-5]  # strip .html
        html_urls.add(stripped_url)
    else:
        non_html_urls.add(url)

# Filter out .html URLs if non-html version exists
urls = set(non_html_urls | (html_urls - non_html_urls))

# standardize the _ and - 
urls = [url.replace('_', '-') for url in urls]



urls = set([key for key in urls for word in ['brockport.edu/live/', 'brockport.edu/life/'] if word not in key and key.count('/') < 5 and key.count('?') == 0])

for url in sorted(final):
    print(url)

https://www2.brockport.edu
https://www2.brockport.edu/about
https://www2.brockport.edu/about/accountability
https://www2.brockport.edu/about/archives
https://www2.brockport.edu/about/brockport-downtown
https://www2.brockport.edu/about/college-calendar
https://www2.brockport.edu/about/contact-us
https://www2.brockport.edu/about/diversity
https://www2.brockport.edu/about/economic-impact
https://www2.brockport.edu/about/facts
https://www2.brockport.edu/about/location
https://www2.brockport.edu/about/lodging
https://www2.brockport.edu/about/middlestates
https://www2.brockport.edu/about/president
https://www2.brockport.edu/about/privacy
https://www2.brockport.edu/about/strategic-plan
https://www2.brockport.edu/about/sustainability
https://www2.brockport.edu/about/title-ix
https://www2.brockport.edu/about/town-gown
https://www2.brockport.edu/academics
https://www2.brockport.edu/academics/academic-affairs
https://www2.brockport.edu/academics/accounting
https://www2.brockport.edu/academics/adv

In [92]:
len(final)

249

In [89]:
len(final)

469

In [46]:
len([key for key in responses_dict.keys() if word in key for word in ['admissions', 'academics']])

UnboundLocalError: cannot access local variable 'word' where it is not associated with a value

In [35]:
for key in sorted(responses_dict.keys()):
    print(key)

https://www2.brockport.edu
https://www2.brockport.edu/about
https://www2.brockport.edu/about/accountability
https://www2.brockport.edu/about/accountability/academic-assessment
https://www2.brockport.edu/about/accountability/academic_assessment.html
https://www2.brockport.edu/about/accountability/committees
https://www2.brockport.edu/about/accountability/directory
https://www2.brockport.edu/about/accountability/strategic-planning
https://www2.brockport.edu/about/accountability/strategic_planning.html
https://www2.brockport.edu/about/accountability/suny
https://www2.brockport.edu/about/archives
https://www2.brockport.edu/about/archives/about
https://www2.brockport.edu/about/archives/college-history
https://www2.brockport.edu/about/archives/college_history.html
https://www2.brockport.edu/about/archives/past-presidents
https://www2.brockport.edu/about/brockport-downtown
https://www2.brockport.edu/about/brockport-downtown/directory
https://www2.brockport.edu/about/brockport-downtown/event-s

In [5]:
urls = [key for key in responses_dict.keys() if '.edu/admissions/apply' in key]


## Many of the URLs have .html on he end of it. This causes some duplication where there is a url with a .html and another without.
# This step filters them out, and standardizes it. If both exist, it'll remove the .html version, if only the .html version exists, it'll strip it and add it to the list.
html_urls = set()
non_html_urls = set()

for url in urls:
    if url.endswith('.html'):
        stripped_url = url[:-5]  # strip .html
        html_urls.add(stripped_url)
    else:
        non_html_urls.add(url)

# Filter out .html URLs if non-html version exists
final_urls = list(non_html_urls | (html_urls - non_html_urls))
final_urls

['https://www2.brockport.edu/admissions/apply',
 'https://www2.brockport.edu/admissions/apply/second_degree',
 'https://www2.brockport.edu/admissions/apply/index',
 'https://www2.brockport.edu/admissions/apply-to-brockport',
 'https://www2.brockport.edu/admissions/apply/special-admission-requirements',
 'https://www2.brockport.edu/admissions/apply/dual-admission',
 'https://www2.brockport.edu/admissions/apply/first-year',
 'https://www2.brockport.edu/admissions/apply/non_degree',
 'https://www2.brockport.edu/admissions/apply/readmission',
 'https://www2.brockport.edu/admissions/apply/second-degree',
 'https://www2.brockport.edu/admissions/apply/special-programs',
 'https://www2.brockport.edu/admissions/apply/update',
 'https://www2.brockport.edu/admissions/apply/dual_admission',
 'https://www2.brockport.edu/admissions/apply/special_programs',
 'https://www2.brockport.edu/admissions/apply/first_year',
 'https://www2.brockport.edu/admissions/apply/special_admission_requirements',
 'https

In [6]:
# Another somewhat similar thing. I have URLs which use _ and - in the URL but point to the same webpage.
# This filters it all out.
final = list(set([url.replace('_', '-') for url in final_urls]))
final

['https://www2.brockport.edu/admissions/apply',
 'https://www2.brockport.edu/admissions/apply/readmission',
 'https://www2.brockport.edu/admissions/apply/index',
 'https://www2.brockport.edu/admissions/apply/second-degree',
 'https://www2.brockport.edu/admissions/apply/faq',
 'https://www2.brockport.edu/admissions/apply/special-programs',
 'https://www2.brockport.edu/admissions/apply/update',
 'https://www2.brockport.edu/admissions/apply/status',
 'https://www2.brockport.edu/admissions/apply-to-brockport',
 'https://www2.brockport.edu/admissions/apply/special-admission-requirements',
 'https://www2.brockport.edu/admissions/apply/transfer',
 'https://www2.brockport.edu/admissions/apply/dual-admission',
 'https://www2.brockport.edu/admissions/apply/first-year',
 'https://www2.brockport.edu/admissions/apply/non-degree']

In [181]:


responses_dict['https://www2.brockport.edu/go/update-directory.html']

KeyError: 'https://www2.brockport.edu/go/update-directory.html'

In [8]:
import csv

# Open the CSV file in write mode
with open('urls2.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    result_list = []
    # Write each URL to a new row
    for url in final:
        # Not great way, but a way to do this. I removed all the .html ones, but still need to access their contents in dictionary.
        # I'll probably change this to make it cleaner in the future.
        if url in responses_dict.keys():
            result = trafilatura.extract(responses_dict[url].text)
        else:
            result = trafilatura.extract(responses_dict[url+'.html'].text)
        
        if result not in result_list:
            result_list.append(result)
        else:
            final.remove(url)
    
    writer.writerow(['\n'])
    for url, result in zip(final, result_list):
        writer.writerow([result])

In [9]:
final

['https://www2.brockport.edu/admissions/apply',
 'https://www2.brockport.edu/admissions/apply/readmission',
 'https://www2.brockport.edu/admissions/apply/second-degree',
 'https://www2.brockport.edu/admissions/apply/faq',
 'https://www2.brockport.edu/admissions/apply/special-programs',
 'https://www2.brockport.edu/admissions/apply/update',
 'https://www2.brockport.edu/admissions/apply/status',
 'https://www2.brockport.edu/admissions/apply-to-brockport',
 'https://www2.brockport.edu/admissions/apply/special-admission-requirements',
 'https://www2.brockport.edu/admissions/apply/transfer',
 'https://www2.brockport.edu/admissions/apply/dual-admission',
 'https://www2.brockport.edu/admissions/apply/first-year',
 'https://www2.brockport.edu/admissions/apply/non-degree']

In [10]:
pd.read_csv("./urls2.csv")

Unnamed: 0,\n
0,SAT/ACT Test Optional\nSUNY Brockport is SAT/A...
1,Readmission to the University is necessary for...
2,I am a freshman applicant. My file is complete...
3,"No two students are alike, and there is no one..."
4,Students may update their admissions applicati...
5,If this is your first visit time logging in to...
6,Student Type\nFirst-time Student\nIf you are a...
7,A separate application or prerequisite courses...
8,"At Brockport, you’ll find an affordable, trans..."
9,Requirements for Dual Admission\nTo be eligibl...


In [11]:
from langchain.document_loaders.csv_loader import CSVLoader


loader = CSVLoader(file_path="./urls2.csv")
data = loader.load()

In [12]:
data

[Document(page_content=': SAT/ACT Test Optional\nSUNY Brockport is SAT/ACT test-optional. Brockport uses a holistic, full-file application review process for both admission and merit-based scholarship consideration.\nChoose Your Application\nPlease choose only ONE of the following applications to fill out. Both applications are equally accepted.\nTrack your Progress\nYou can track your application status in our online system and check for any missing documents or additional required items.\nOther Audiences\nStudents wishing to take up to 24 credits, non-degree seeking.\nStudents who have separated from the University because of an expired leave of absence, graduation, or academic dismissal.\nVisiting students are matriculated at another college but are taking classes at SUNY Brockport.\nStudents who have earned an undergraduate degree and wish to earn a second baccalaureate degree.\nStudents who have served or are serving in the US Armed Forces.\nStudents who are enrolled in a Dual Adm

In [13]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300, 
    chunk_overlap=50
)
texts = text_splitter.split_documents(data)

In [14]:
import re

def standardize_string(input_string):
    # Step 1: Remove '\n' characters and replace them with periods
    standardized_string = input_string.replace('\n', ' ')

    # Step 2: Standardize the number of spaces
    standardized_string = re.sub(r'\s+', ' ', standardized_string)

    # Step 3: Remove non-alphanumeric characters at the start of the string
    standardized_string = re.sub(r'^[^a-zA-Z0-9]+', '', standardized_string)

    return standardized_string.strip()

In [15]:
# Create new 'texts' with some additional filters
texts_cleaned = []

# Iterate over texts page_content category with this cleaning method.
for id in range(len(texts)):
    texts[id].page_content = standardize_string(texts[id].page_content)

    if len(texts[id].page_content) > 100:
        texts_cleaned.append(texts[id])

texts_cleaned

[Document(page_content='SAT/ACT Test Optional SUNY Brockport is SAT/ACT test-optional. Brockport uses a holistic, full-file application review process for both admission and merit-based scholarship consideration. Choose Your Application', metadata={'source': './urls2.csv', 'row': 0}),
 Document(page_content='Choose Your Application Please choose only ONE of the following applications to fill out. Both applications are equally accepted. Track your Progress You can track your application status in our online system and check for any missing documents or additional required items. Other Audiences', metadata={'source': './urls2.csv', 'row': 0}),
 Document(page_content='Other Audiences Students wishing to take up to 24 credits, non-degree seeking. Students who have separated from the University because of an expired leave of absence, graduation, or academic dismissal. Visiting students are matriculated at another college but are taking classes at SUNY Brockport.', metadata={'source': './url

# Now Create Embeddings

Going to try openai embeddings this time around, under Morgan's suggestion that they're the best

In [16]:
embedding_function = OpenAIEmbeddings()

vectordb = Chroma.from_documents(
    documents = texts_cleaned,
    embedding = embedding_function
)

In [17]:
retreiver = vectordb.as_retriever()
docs = retreiver.get_relevant_documents("Where do I apply")

docs

[Document(page_content='Choose Your Application Please choose only ONE of the following applications to fill out. Both applications are equally accepted. Track your Progress You can track your application status in our online system and check for any missing documents or additional required items. Other Audiences', metadata={'source': './urls2.csv', 'row': 0}),
 Document(page_content='How to Apply If you would like to be readmitted to the University, please speak with an Undergraduate Admissions advisor prior to applying.', metadata={'source': './urls2.csv', 'row': 1}),
 Document(page_content='Once you have started your application in either the Common Application or the SUNY Application, you need to submit the required documents in order for your application to be processed. Work with your school counselor to ensure all appropriate materials are submitted to our office.', metadata={'source': './urls2.csv', 'row': 10}),
 Document(page_content='Applicants for the fall semester can acces

In [18]:
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=retreiver,
    return_source_documents=True,
    verbose=True
)

# Cite sources
def process_llm_response(llm_response):
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [31]:
query = "I got accepted! Can you give me a list of what are my next steps are?"
llm_response = qa_chain(query)
process_llm_response(llm_response)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
 Congratulations! Once you have accepted your offer of admission, your next steps will vary depending on your program. Please refer to the New Student Checklist for a comprehensive list of tasks to complete prior to enrolling in your program. You can find the New Student Checklist at [link].


Sources:
./urls2.csv
./urls2.csv
./urls2.csv
./urls2.csv
